You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/27 00:34:32 UTC
[26/32] incubator-joshua git commit: Merge branch 'master' into
JOSHUA-252 (compiling, but not tested)
Merge branch 'master' into JOSHUA-252 (compiling, but not tested)
# Conflicts:
# src/joshua/decoder/ff/tm/format/MosesFormatReader.java
# src/joshua/decoder/ff/tm/format/PhraseFormatReader.java
# src/main/java/org/apache/joshua/corpus/Vocabulary.java
# src/main/java/org/apache/joshua/decoder/DecoderThread.java
# src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
# src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
# src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
# src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
# src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
# src/main/java/org/apache/joshua/decoder/ff/tm/PhraseRule.java
# src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
# src/main/java/org/apache/joshua/decoder/ff/tm/format/PhraseFormatReader.java
# src/main/java/org/apache/joshua/decoder/ff/tm/format/SamtFormatReader.java
# src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
# src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
# src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
# src/main/java/org/apache/joshua/decoder/hypergraph/OutputStringExtractor.java
# src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
# src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
# src/main/java/org/apache/joshua/oracle/OracleExtractionHG.java
# src/main/java/org/apache/joshua/tools/GrammarPacker.java
# src/main/java/org/apache/joshua/util/CompareGrammars.java
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/abb8c518
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/abb8c518
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/abb8c518
Branch: refs/heads/JOSHUA-252
Commit: abb8c5181e85c67679573d6c7538e498a404d8af
Parents: 9d6f84d 4d73c17
Author: Matt Post <po...@cs.jhu.edu>
Authored: Thu May 26 17:44:48 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu May 26 17:44:48 2016 -0400
----------------------------------------------------------------------
scripts/support/grammar-packer.pl | 10 +-
scripts/support/moses2joshua_grammar.pl | 13 +-
scripts/support/moses_phrase_to_joshua.pl | 23 ---
scripts/support/run_bundler.py | 1 +
scripts/support/split2files | 44 ++++
scripts/support/splittabs.pl | 42 ----
scripts/training/pipeline.pl | 97 ++-------
scripts/training/split2files.pl | 38 ----
scripts/training/trim_parallel_corpus.pl | 2 +-
.../apache/joshua/corpus/TerminalIterator.java | 4 +-
.../org/apache/joshua/corpus/Vocabulary.java | 16 +-
.../joshua/decoder/JoshuaConfiguration.java | 17 +-
.../joshua/decoder/chart_parser/Chart.java | 2 +-
.../decoder/chart_parser/CubePruneState.java | 4 +-
.../apache/joshua/decoder/ff/TargetBigram.java | 3 +-
.../joshua/decoder/ff/lm/LanguageModelFF.java | 5 +-
.../ff/lm/StateMinimizingLanguageModel.java | 5 +-
.../joshua/decoder/ff/tm/GrammarReader.java | 54 +----
.../apache/joshua/decoder/ff/tm/PhraseRule.java | 94 ---------
.../org/apache/joshua/decoder/ff/tm/Rule.java | 30 +--
.../decoder/ff/tm/format/HieroFormatReader.java | 85 ++++----
.../decoder/ff/tm/format/MosesFormatReader.java | 106 ++++++++++
.../ff/tm/format/PhraseFormatReader.java | 128 ------------
.../decoder/ff/tm/format/SamtFormatReader.java | 136 -------------
.../tm/hash_based/MemoryBasedBatchGrammar.java | 12 +-
.../decoder/ff/tm/packed/PackedGrammar.java | 35 +++-
.../GrammarBuilderWalkerFunction.java | 26 +--
.../joshua/decoder/hypergraph/HyperEdge.java | 11 +-
.../hypergraph/OutputStringExtractor.java | 11 +-
.../joshua/decoder/phrase/PhraseTable.java | 13 +-
.../joshua/oracle/OracleExtractionHG.java | 3 +-
.../org/apache/joshua/tools/GrammarPacker.java | 164 +++++++--------
.../org/apache/joshua/util/CompareGrammars.java | 200 -------------------
.../org/apache/joshua/util/FormatUtils.java | 28 ++-
.../apache/joshua/corpus/VocabularyTest.java | 22 +-
.../org/apache/joshua/util/FormatUtilsTest.java | 6 +-
src/test/resources/decoder/phrase/decode/config | 2 +-
.../decoder/phrase/decode/rules.packed/config | 3 +-
.../decode/rules.packed/slice_00000.features | Bin 4128858 -> 4128858 bytes
.../decode/rules.packed/slice_00000.source | Bin 1982228 -> 1982244 bytes
.../decode/rules.packed/slice_00000.target | Bin 1463856 -> 2652936 bytes
.../rules.packed/slice_00000.target.lookup | Bin 28 -> 32 bytes
.../phrase/decode/rules.packed/vocabulary | Bin 169225 -> 169236 bytes
.../decoder/phrase/include-align-index/test.sh | 17 +-
44 files changed, 436 insertions(+), 1076 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/scripts/training/pipeline.pl
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/corpus/TerminalIterator.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/corpus/TerminalIterator.java
index e82b4cc,0000000..fcf5c72
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/corpus/TerminalIterator.java
+++ b/src/main/java/org/apache/joshua/corpus/TerminalIterator.java
@@@ -1,83 -1,0 +1,85 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.corpus;
+
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
++import org.apache.joshua.util.FormatUtils;
++
+/**
+ * Iterator capable of iterating over those word identifiers in a phrase which represent terminals.
+ * <p>
+ * <em>Note</em>: This class is <em>not</em> thread-safe.
+ *
+ * @author Lane Schwartz
+ */
+public class TerminalIterator implements Iterator<Integer> {
+
+ private final int[] words;
+
+ private int nextIndex = -1;
+ private int next = Integer.MIN_VALUE;
+ private boolean dirty = true;
+
+ /**
+ * Constructs an iterator for the terminals in the given list of words.
+ *
+ * @param words array of words
+ */
+ public TerminalIterator(int[] words) {
+ this.words = words;
+ }
+
+ /* See Javadoc for java.util.Iterator#next(). */
+ public boolean hasNext() {
+
- while (dirty || Vocabulary.nt(next)) {
++ while (dirty || FormatUtils.isNonterminal(next)) {
+ nextIndex++;
+ if (nextIndex < words.length) {
+ next = words[nextIndex];
+ dirty = false;
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+ }
+
+ /* See Javadoc for java.util.Iterator#next(). */
+ public Integer next() {
+ if (hasNext()) {
+ dirty = true;
+ return next;
+ } else {
+ throw new NoSuchElementException();
+ }
+ }
+
+ /**
+ * Unsupported operation, guaranteed to throw an UnsupportedOperationException.
+ *
+ * @throws UnsupportedOperationException operation not supported yet!
+ */
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/corpus/Vocabulary.java
index 2bcc447,0000000..8416e4a
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/corpus/Vocabulary.java
+++ b/src/main/java/org/apache/joshua/corpus/Vocabulary.java
@@@ -1,309 -1,0 +1,295 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.corpus;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.Externalizable;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectOutput;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.concurrent.locks.StampedLock;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.ff.lm.NGramLanguageModel;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Static singular vocabulary class.
+ * Supports (de-)serialization into a vocabulary file.
+ *
+ * @author Juri Ganitkevitch
+ */
+
+public class Vocabulary implements Externalizable {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Vocabulary.class);
+ private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<>();
+
+ private static List<String> idToString;
+ private static Map<String, Integer> stringToId;
+ private static final StampedLock lock = new StampedLock();
+
+ static final int UNKNOWN_ID = 0;
+ static final String UNKNOWN_WORD = "<unk>";
+
+ public static final String START_SYM = "<s>";
+ public static final String STOP_SYM = "</s>";
+
+ static {
+ clear();
+ }
+
+ public static boolean registerLanguageModel(NGramLanguageModel lm) {
+ long lock_stamp = lock.writeLock();
+ try {
+ // Store the language model.
+ LMs.add(lm);
+ // Notify it of all the existing words.
+ boolean collision = false;
+ for (int i = idToString.size() - 1; i > 0; i--)
+ collision = collision || lm.registerWord(idToString.get(i), i);
+ return collision;
+ } finally {
+ lock.unlockWrite(lock_stamp);
+ }
+ }
+
+ /**
+ * Reads a vocabulary from file. This deletes any additions to the vocabulary made prior to
+ * reading the file.
+ *
+ * @param vocab_file path to a vocabulary file
+ * @return Returns true if vocabulary was read without mismatches or collisions.
+ * @throws IOException of the file cannot be found or read properly
+ */
+ public static boolean read(final File vocab_file) throws IOException {
+ DataInputStream vocab_stream =
+ new DataInputStream(new BufferedInputStream(new FileInputStream(vocab_file)));
+ int size = vocab_stream.readInt();
+ LOG.info("Read {} entries from the vocabulary", size);
+ clear();
+ for (int i = 0; i < size; i++) {
+ int id = vocab_stream.readInt();
+ String token = vocab_stream.readUTF();
+ if (id != Math.abs(id(token))) {
+ vocab_stream.close();
+ return false;
+ }
+ }
+ vocab_stream.close();
+ return (size + 1 == idToString.size());
+ }
+
+ public static void write(String file_name) throws IOException {
+ long lock_stamp =lock.readLock();
+ try {
+ File vocab_file = new File(file_name);
+ DataOutputStream vocab_stream =
+ new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
+ vocab_stream.writeInt(idToString.size() - 1);
+ LOG.info("Writing vocabulary: {} tokens", idToString.size() - 1);
+ for (int i = 1; i < idToString.size(); i++) {
+ vocab_stream.writeInt(i);
+ vocab_stream.writeUTF(idToString.get(i));
+ }
+ vocab_stream.close();
+ }
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ /**
+ * Get the id of the token if it already exists, new id is created otherwise.
+ *
+ * TODO: currently locks for every call. Separate constant (frozen) ids from
+ * changing (e.g. OOV) ids. Constant ids could be immutable -> no locking.
+ * Alternatively: could we use ConcurrentHashMap to not have to lock if
+ * actually contains it and only lock for modifications?
+ *
+ * @param token a token to obtain an id for
+ * @return the token id
+ */
+ public static int id(String token) {
+ // First attempt an optimistic read
+ long attempt_read_lock = lock.tryOptimisticRead();
+ if (stringToId.containsKey(token)) {
+ int resultId = stringToId.get(token);
+ if (lock.validate(attempt_read_lock)) {
+ return resultId;
+ }
+ }
+
+ // The optimistic read failed, try a read with a stamped read lock
+ long read_lock_stamp = lock.readLock();
+ try {
+ if (stringToId.containsKey(token)) {
+ return stringToId.get(token);
+ }
+ } finally {
+ lock.unlockRead(read_lock_stamp);
+ }
+
+ // Looks like the id we want is not there, let's get a write lock and add it
+ long write_lock_stamp = lock.writeLock();
+ try {
+ if (stringToId.containsKey(token)) {
+ return stringToId.get(token);
+ }
- int id = idToString.size() * (nt(token) ? -1 : 1);
++ int id = idToString.size() * (FormatUtils.isNonterminal(token) ? -1 : 1);
+
+ // register this (token,id) mapping with each language
+ // model, so that they can map it to their own private
+ // vocabularies
+ for (NGramLanguageModel lm : LMs)
+ lm.registerWord(token, Math.abs(id));
+
+ idToString.add(token);
+ stringToId.put(token, id);
+ return id;
+ } finally {
+ lock.unlockWrite(write_lock_stamp);
+ }
+ }
+
+ public static boolean hasId(int id) {
+ long lock_stamp = lock.readLock();
+ try {
+ id = Math.abs(id);
+ return (id < idToString.size());
+ }
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ public static int[] addAll(String sentence) {
+ return addAll(sentence.split("\\s+"));
+ }
+
+ public static int[] addAll(String[] tokens) {
+ int[] ids = new int[tokens.length];
+ for (int i = 0; i < tokens.length; i++)
+ ids[i] = id(tokens[i]);
+ return ids;
+ }
+
+ public static String word(int id) {
+ long lock_stamp = lock.readLock();
+ try {
+ id = Math.abs(id);
+ return idToString.get(id);
+ }
+ finally{
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ public static String getWords(int[] ids) {
+ if (ids.length == 0) return "";
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < ids.length - 1; i++)
+ sb.append(word(ids[i])).append(" ");
+ return sb.append(word(ids[ids.length - 1])).toString();
+ }
+
+ public static String getWords(final Iterable<Integer> ids) {
+ StringBuilder sb = new StringBuilder();
+ for (int id : ids)
+ sb.append(word(id)).append(" ");
+ return sb.deleteCharAt(sb.length() - 1).toString();
+ }
+
+ public static int getUnknownId() {
+ return UNKNOWN_ID;
+ }
+
+ public static String getUnknownWord() {
+ return UNKNOWN_WORD;
+ }
+
- /**
- * Returns true if the Vocabulary ID represents a nonterminal.
- *
- * @param id vocabularly ID to check
- * @return true if the Vocabulary ID represents a nonterminal
- */
- public static boolean nt(int id) {
- return (id < 0);
- }
-
- public static boolean nt(String word) {
- return FormatUtils.isNonterminal(word);
- }
-
+ public static int size() {
+ long lock_stamp = lock.readLock();
+ try {
+ return idToString.size();
+ } finally {
+ lock.unlockRead(lock_stamp);
+ }
+ }
+
+ public static synchronized int getTargetNonterminalIndex(int id) {
+ return FormatUtils.getNonterminalIndex(word(id));
+ }
+
+ /**
+ * Clears the vocabulary and initializes it with an unknown word. Registered
+ * language models are left unchanged.
+ */
+ public static void clear() {
+ long lock_stamp = lock.writeLock();
+ try {
+ idToString = new ArrayList<String>();
+ stringToId = new HashMap<String, Integer>();
+
+ idToString.add(UNKNOWN_ID, UNKNOWN_WORD);
+ stringToId.put(UNKNOWN_WORD, UNKNOWN_ID);
+ } finally {
+ lock.unlockWrite(lock_stamp);
+ }
+ }
+
+ public static void unregisterLanguageModels() {
+ LMs.clear();
+ }
+
+ @Override
+ public void writeExternal(ObjectOutput out) throws IOException {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public void readExternal(ObjectInput in)
+ throws IOException, ClassNotFoundException {
+ // TODO Auto-generated method stub
+
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if(getClass() == o.getClass()) {
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
index 2bf8e37,0000000..5acfd7e
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
+++ b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
@@@ -1,713 -1,0 +1,712 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import static org.apache.joshua.util.FormatUtils.cleanNonTerminal;
- import static org.apache.joshua.util.FormatUtils.markup;
++import static org.apache.joshua.util.FormatUtils.ensureNonTerminalBrackets;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.util.ArrayList;
+import java.util.Collections;
+
+import org.apache.joshua.decoder.ff.StatefulFF;
+import org.apache.joshua.decoder.ff.fragmentlm.Tree;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.Regex;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Configuration file for Joshua decoder.
+ *
+ * When adding new features to Joshua, any new configurable parameters should be added to this
+ * class.
+ *
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+public class JoshuaConfiguration {
+
+ private static final Logger LOG = LoggerFactory.getLogger(JoshuaConfiguration.class);
+
+ // whether to construct a StructuredTranslation object for each request instead of
+ // printing to stdout. Used when the Decoder is used from Java directly.
+ public Boolean use_structured_output = false;
+
+ // If set to true, Joshua will lowercase the input, creating an annotation that marks the
+ // original case
+ public boolean lowercase = false;
+
+ // If set to true, Joshua will recapitalize the output by projecting the case from aligned
+ // source-side words
+ public boolean project_case = false;
+
+ // List of grammar files to read
+ public ArrayList<String> tms = new ArrayList<String>();
+
+ // A rule cache for commonly used tries to avoid excess object allocations
+ // Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
+ public Integer cachedRuleSize = new Integer(5000);
+
+ /*
+ * The file to read the weights from (part of the sparse features implementation). Weights can
+ * also just be listed in the main config file.
+ */
+ public String weights_file = "";
+ // Default symbols. The symbol here should be enclosed in square brackets.
- public String default_non_terminal = FormatUtils.markup("X");
-
- public String goal_symbol = FormatUtils.markup("GOAL");
++ public String default_non_terminal = FormatUtils.ensureNonTerminalBrackets("X");
++ public String goal_symbol = FormatUtils.ensureNonTerminalBrackets("GOAL");
+
+ /*
+ * A list of OOV symbols in the form
+ *
+ * [X1] weight [X2] weight [X3] weight ...
+ *
+ * where the [X] symbols are nonterminals and the weights are weights. For each OOV word w in the
+ * input sentence, Joshua will create rules of the form
+ *
+ * X1 -> w (weight)
+ *
+ * If this is empty, an unweighted default_non_terminal is used.
+ */
+ public class OOVItem implements Comparable<OOVItem> {
+ public String label;
+
+ public float weight;
+
+ OOVItem(String l, float w) {
+ label = l;
+ weight = w;
+ }
+ @Override
+ public int compareTo(OOVItem other) {
+ if (weight > other.weight)
+ return -1;
+ else if (weight < other.weight)
+ return 1;
+ return 0;
+ }
+ }
+
+ public ArrayList<OOVItem> oovList = null;
+
+ /*
+ * Whether to segment OOVs into a lattice
+ */
+ public boolean segment_oovs = false;
+
+ /*
+ * Enable lattice decoding.
+ */
+ public boolean lattice_decoding = false;
+
+ /*
+ * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
+ * sorted till they are first accessed. Amortized sorting means you get your first translation
+ * much, much quicker (good for debugging), but that per-sentence decoding is a bit slower.
+ */
+ public boolean amortized_sorting = true;
+ // syntax-constrained decoding
+ public boolean constrain_parse = false;
+
+ public boolean use_pos_labels = false;
+
+ // oov-specific
+ public boolean true_oovs_only = false;
+
+ /* Dynamic sentence-level filtering. */
+ public boolean filter_grammar = false;
+
+ /* The cube pruning pop limit. Set to 0 for exhaustive pruning. */
+ public int pop_limit = 100;
+
+ /* Maximum sentence length. Sentences longer than this are truncated. */
+ public int maxlen = 200;
+
+ /*
+ * N-best configuration.
+ */
+ // Make sure output strings in the n-best list are unique.
+ public boolean use_unique_nbest = true;
+
+ /* Include the phrasal alignments in the output (not word-level alignmetns at the moment). */
+ public boolean include_align_index = false;
+
+ /* The number of hypotheses to output by default. */
+ public int topN = 1;
+
+ /**
+ * This string describes the format of each line of output from the decoder (i.e., the
+ * translations). The string can include arbitrary text and also variables. The following
+ * variables are available:
+ *
+ * <pre>
+ * - %i the 0-indexed sentence number
+ * - %e the source string %s the translated sentence
+ * - %S the translated sentence with some basic capitalization and denormalization
+ * - %t the synchronous derivation
+ * - %f the list of feature values (as name=value pairs)
+ * - %c the model cost
+ * - %w the weight vector
+ * - %a the alignments between source and target words (currently unimplemented)
+ * - %d a verbose, many-line version of the derivation
+ * </pre>
+ */
+ public String outputFormat = "%i ||| %s ||| %f ||| %c";
+
+ /* The number of decoding threads to use (-threads). */
+ public int num_parallel_decoders = 1;
+
+ // disk hg
+ public String hypergraphFilePattern = "";
+
+ /*
+ * When true, _OOV is appended to all words that are passed through (useful for something like
+ * transliteration on the target side
+ */
+ public boolean mark_oovs = false;
+
+ /* Enables synchronous parsing. */
+ public boolean parse = false; // perform synchronous parsing
+
+
+ /* A list of the feature functions. */
+ public ArrayList<String> features = new ArrayList<String>();
+
+ /* A list of weights found in the main config file (instead of in a separate weights file) */
+ public ArrayList<String> weights = new ArrayList<String>();
+
+ /* Determines whether to expect JSON input or plain lines */
+ public enum INPUT_TYPE { plain, json };
+ public INPUT_TYPE input_type = INPUT_TYPE.plain;
+
+ /* Type of server. Not sure we need to keep the regular TCP one around. */
+ public enum SERVER_TYPE { none, TCP, HTTP };
+ public SERVER_TYPE server_type = SERVER_TYPE.TCP;
+
+ /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
+ public int server_port = 0;
+
+ /*
+ * Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
+ * the input sentences in the following format:
+ *
+ * input sentence ||| ||| reference1 ||| reference2 ...
+ *
+ * (The second field is reserved for the output sentence for alignment and forced decoding).
+ */
+
+ public boolean rescoreForest = false;
+ public float rescoreForestWeight = 10.0f;
+
+ /*
+ * Location of fragment mapping file, which maps flattened SCFG rules to their internal
+ * representation.
+ */
+ public String fragmentMapFile = null;
+
+ /*
+ * Whether to use soft syntactic constraint decoding /fuzzy matching, which allows that any
+ * nonterminal may be substituted for any other nonterminal (except for OOV and GOAL)
+ */
+ public boolean fuzzy_matching = false;
+
+ public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";
+
+ /***
+ * Phrase-based decoding parameters.
+ */
+
+ /* The search algorithm: currently either "cky" or "stack" */
+ public String search_algorithm = "cky";
+
+ /* The distortion limit */
+ public int reordering_limit = 8;
+
+ /* The number of target sides considered for each source side (after sorting by model weight) */
+ public int num_translation_options = 20;
+
+ /* If true, decode using a dot chart (standard CKY+); if false, use the much more efficient
+ * version of Sennrich (SSST 2014)
+ */
+ public boolean use_dot_chart = true;
+
+ /* Moses compatibility */
+ public boolean moses = false;
+
+ /* If true, just print out the weights found in the config file, and exit. */
+ public boolean show_weights_and_quit = false;
+
+ /* Read input from a file (Moses compatible flag) */
+ public String input_file = null;
+
+ /* Write n-best output to this file */
+ public String n_best_file = null;
+
+ /* Whether to look at source side for special annotations */
+ public boolean source_annotations = false;
+
+ /* Weights overridden from the command line */
+ public String weight_overwrite = "";
+
+ /**
+ * This method resets the state of JoshuaConfiguration back to the state after initialization.
+ * This is useful when for example making different calls to the decoder within the same java
+ * program, which otherwise leads to potential errors due to inconsistent state as a result of
+ * loading the configuration multiple times without resetting etc.
+ *
+ * This leads to the insight that in fact it may be an even better idea to refactor the code and
+ * make JoshuaConfiguration an object that is is created and passed as an argument, rather than a
+ * shared static object. This is just a suggestion for the next step.
+ *
+ */
+ public void reset() {
+ LOG.info("Resetting the JoshuaConfiguration to its defaults ...");
+ LOG.info("\n\tResetting the StatefullFF global state index ...");
+ LOG.info("\n\t...done");
+ StatefulFF.resetGlobalStateIndex();
+ tms = new ArrayList<String>();
+ weights_file = "";
+ default_non_terminal = "[X]";
+ oovList = new ArrayList<OOVItem>();
+ oovList.add(new OOVItem(default_non_terminal, 1.0f));
+ goal_symbol = "[GOAL]";
+ amortized_sorting = true;
+ constrain_parse = false;
+ use_pos_labels = false;
+ true_oovs_only = false;
+ filter_grammar = false;
+ pop_limit = 100;
+ maxlen = 200;
+ use_unique_nbest = false;
+ include_align_index = false;
+ topN = 1;
+ outputFormat = "%i ||| %s ||| %f ||| %c";
+ num_parallel_decoders = 1;
+ hypergraphFilePattern = "";
+ mark_oovs = false;
+ // oracleFile = null;
+ parse = false; // perform synchronous parsing
+ features = new ArrayList<String>();
+ weights = new ArrayList<String>();
+ server_port = 0;
+
+ reordering_limit = 8;
+ num_translation_options = 20;
+ LOG.info("...done");
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+
+ /**
+ * To process command-line options, we write them to a file that looks like the config file, and
+ * then call readConfigFile() on it. It would be more general to define a class that sits on a
+ * stream and knows how to chop it up, but this was quicker to implement.
+ *
+ * @param options string array of command line options
+ */
+ public void processCommandLineOptions(String[] options) {
+ try {
+ File tmpFile = File.createTempFile("options", null, null);
+ PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
+
+ for (int i = 0; i < options.length; i++) {
+ String key = options[i].substring(1);
+ if (i + 1 == options.length || options[i + 1].startsWith("-")) {
+ // if this is the last item, or if the next item
+ // is another flag, then this is a boolean flag
+ out.println(key + " = true");
+
+ } else {
+ out.print(key + " =");
+ while (i + 1 < options.length && ! options[i + 1].startsWith("-")) {
+ out.print(String.format(" %s", options[i + 1]));
+ i++;
+ }
+ out.println();
+ }
+ }
+ out.close();
+ this.readConfigFile(tmpFile.getCanonicalPath());
+
+ tmpFile.delete();
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public void readConfigFile(String configFile) throws IOException {
+
+ LineReader configReader = new LineReader(configFile, false);
+ try {
+ for (String line : configReader) {
+ line = line.trim(); // .toLowerCase();
+
+ if (Regex.commentOrEmptyLine.matches(line))
+ continue;
+
+ /*
+ * There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
+ * values. Parameters match the pattern "key = value"; all other substantive lines are
+ * interpreted as features.
+ */
+
+ if (line.indexOf("=") != -1) { // parameters; (not feature function)
+ String[] fds = Regex.equalsWithSpaces.split(line, 2);
+ if (fds.length < 2) {
+ LOG.warn("skipping config file line '{}'", line);
+ continue;
+ }
+
+ String parameter = normalize_key(fds[0]);
+
+ if (parameter.equals(normalize_key("lm"))) {
+ /* This is deprecated. This support old LM lines of the form
+ *
+ * lm = berkeleylm 5 false false 100 lm.gz
+ *
+ * LMs are now loaded as general feature functions, so we transform that to either
+ *
+ * feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
+ *
+ * If the line were state minimizing:
+ *
+ * lm = kenlm 5 true false 100 lm.gz
+ *
+ * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
+ */
+
+ String[] tokens = fds[1].split("\\s+");
+ if (tokens[2].equals("true"))
+ features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
+ tokens[1], tokens[5]));
+ else
+ features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
+ tokens[0], tokens[1], tokens[5]));
+
+ } else if (parameter.equals(normalize_key("tm"))) {
+ /* If found, convert old format:
+ * tm = TYPE OWNER MAXSPAN PATH
+ * to new format
+ * tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH
+ */
+ String tmLine = fds[1];
+
+ String[] tokens = fds[1].split("\\s+");
+ if (! tokens[1].startsWith("-")) { // old format
+ tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1], tokens[2], tokens[3]);
+ LOG.warn("Converting deprecated TM line from '{}' -> '{}'", fds[1], tmLine);
+ }
+ tms.add(tmLine);
+
+ } else if (parameter.equals("v")) {
+ Decoder.VERBOSE = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("parse"))) {
+ parse = Boolean.parseBoolean(fds[1]);
+ LOG.debug("parse: {}", parse);
+
+ } else if (parameter.equals(normalize_key("dump-hypergraph"))) {
+ hypergraphFilePattern = fds[1].trim();
+ LOG.debug(" hypergraph dump file format: {}", hypergraphFilePattern);
+
+ } else if (parameter.equals(normalize_key("oov-list"))) {
+ if (new File(fds[1]).exists()) {
+ oovList = new ArrayList<OOVItem>();
+ try {
+ File file = new File(fds[1]);
+ BufferedReader br = new BufferedReader(new FileReader(file));
+ try {
+ String str = br.readLine();
+ while (str != null) {
+ String[] tokens = str.trim().split("\\s+");
+
- oovList.add(new OOVItem(FormatUtils.markup(tokens[0]),
- (float) Math.log(Float.parseFloat(tokens[1]))));
++ oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[0]),
++ (float) Math.log(Float.parseFloat(tokens[1]))));
+
+ str = br.readLine();
+ }
+ br.close();
+ } catch(IOException e){
+ System.out.println(e);
+ }
+ } catch(IOException e){
+ System.out.println(e);
+ }
+ Collections.sort(oovList);
+
+ } else {
+ String[] tokens = fds[1].trim().split("\\s+");
+ if (tokens.length % 2 != 0) {
+ throw new RuntimeException(String.format("* FATAL: invalid format for '%s'", fds[0]));
+ }
+ oovList = new ArrayList<OOVItem>();
+
+ for (int i = 0; i < tokens.length; i += 2)
- oovList.add(new OOVItem(FormatUtils.markup(tokens[i]),
++ oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[i]),
+ (float) Math.log(Float.parseFloat(tokens[i + 1]))));
+
+ Collections.sort(oovList);
+ }
+
+ } else if (parameter.equals(normalize_key("lattice-decoding"))) {
+ lattice_decoding = true;
+
+ } else if (parameter.equals(normalize_key("segment-oovs"))) {
+ segment_oovs = true;
+ lattice_decoding = true;
+
+ } else if (parameter.equals(normalize_key("default-non-terminal"))) {
- default_non_terminal = markup(cleanNonTerminal(fds[1].trim()));
++ default_non_terminal = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
+ LOG.debug("default_non_terminal: {}", default_non_terminal);
+
+ } else if (parameter.equals(normalize_key("goal-symbol"))) {
- goal_symbol = markup(cleanNonTerminal(fds[1].trim()));
++ goal_symbol = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
+ LOG.debug("goalSymbol: {}", goal_symbol);
+
+ } else if (parameter.equals(normalize_key("weights-file"))) {
+ weights_file = fds[1];
+
+ } else if (parameter.equals(normalize_key("constrain_parse"))) {
+ constrain_parse = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("true_oovs_only"))) {
+ true_oovs_only = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("filter-grammar"))) {
+ filter_grammar = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("amortize"))) {
+ amortized_sorting = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("use_pos_labels"))) {
+ use_pos_labels = Boolean.parseBoolean(fds[1]);
+
+ } else if (parameter.equals(normalize_key("use_unique_nbest"))) {
+ use_unique_nbest = Boolean.valueOf(fds[1]);
+ LOG.debug("use_unique_nbest: {}", use_unique_nbest);
+
+ } else if (parameter.equals(normalize_key("output-format"))) {
+ outputFormat = fds[1];
+ LOG.debug("output-format: {}", outputFormat);
+
+ } else if (parameter.equals(normalize_key("include_align_index"))) {
+ include_align_index = Boolean.valueOf(fds[1]);
+ LOG.debug("include_align_index: {}", include_align_index);
+
+ } else if (parameter.equals(normalize_key("top_n"))) {
+ topN = Integer.parseInt(fds[1]);
+ LOG.debug("topN: {}", topN);
+
+ } else if (parameter.equals(normalize_key("num_parallel_decoders"))
+ || parameter.equals(normalize_key("threads"))) {
+ num_parallel_decoders = Integer.parseInt(fds[1]);
+ if (num_parallel_decoders <= 0) {
+ throw new IllegalArgumentException(
+ "Must specify a positive number for num_parallel_decoders");
+ }
+ LOG.debug("num_parallel_decoders: {}", num_parallel_decoders);
+
+ } else if (parameter.equals(normalize_key("mark_oovs"))) {
+ mark_oovs = Boolean.valueOf(fds[1]);
+ LOG.debug("mark_oovs: {}", mark_oovs);
+
+ } else if (parameter.equals(normalize_key("pop-limit"))) {
+ pop_limit = Integer.parseInt(fds[1]);
+ LOG.info("pop-limit: {}", pop_limit);
+
+ } else if (parameter.equals(normalize_key("input-type"))) {
+ if (fds[1].equals("json")) {
+ input_type = INPUT_TYPE.json;
+ } else if (fds[1].equals("plain")) {
+ input_type = INPUT_TYPE.plain;
+ } else {
+ throw new RuntimeException(String.format("* FATAL: invalid server type '%s'", fds[1]));
+ }
+ LOG.info(" input-type: {}", input_type);
+
+ } else if (parameter.equals(normalize_key("server-type"))) {
+ if (fds[1].toLowerCase().equals("tcp"))
+ server_type = SERVER_TYPE.TCP;
+ else if (fds[1].toLowerCase().equals("http"))
+ server_type = SERVER_TYPE.HTTP;
+
+ LOG.info(" server-type: {}", server_type);
+
+ } else if (parameter.equals(normalize_key("server-port"))) {
+ server_port = Integer.parseInt(fds[1]);
+ LOG.info(" server-port: {}", server_port);
+
+ } else if (parameter.equals(normalize_key("rescore-forest"))) {
+ rescoreForest = true;
+ LOG.info(" rescore-forest: {}", rescoreForest);
+
+ } else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
+ rescoreForestWeight = Float.parseFloat(fds[1]);
+ LOG.info(" rescore-forest-weight: {}", rescoreForestWeight);
+
+ } else if (parameter.equals(normalize_key("maxlen"))) {
+ // reset the maximum length
+ maxlen = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals("c") || parameter.equals("config")) {
+ // this was used to send in the config file, just ignore it
+ ;
+
+ } else if (parameter.equals(normalize_key("feature-function"))) {
+ // add the feature to the list of features for later processing
+ features.add("feature_function = " + fds[1]);
+
+ } else if (parameter.equals(normalize_key("maxlen"))) {
+ // add the feature to the list of features for later processing
+ maxlen = Integer.parseInt(fds[1]);
+
+ } else if (parameter
+ .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
+ fuzzy_matching = Boolean.parseBoolean(fds[1]);
+ LOG.debug("fuzzy_matching: {}", fuzzy_matching);
+
+ } else if (parameter.equals(normalize_key("fragment-map"))) {
+ fragmentMapFile = fds[1];
+ Tree.readMapping(fragmentMapFile);
+
+ /** PHRASE-BASED PARAMETERS **/
+ } else if (parameter.equals(normalize_key("search"))) {
+ search_algorithm = fds[1];
+
+ if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) {
+ throw new RuntimeException(
+ "-search must be one of 'stack' (for phrase-based decoding) " +
+ "or 'cky' (for hierarchical / syntactic decoding)");
+ }
+
+ if (search_algorithm.equals("cky") && include_align_index) {
+ throw new RuntimeException(
+ "include_align_index is currently not supported with cky search");
+ }
+
+ } else if (parameter.equals(normalize_key("reordering-limit"))) {
+ reordering_limit = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("num-translation-options"))) {
+ num_translation_options = Integer.parseInt(fds[1]);
+
+ } else if (parameter.equals(normalize_key("no-dot-chart"))) {
+ use_dot_chart = false;
+
+ } else if (parameter.equals(normalize_key("moses"))) {
+ moses = true; // triggers some Moses-specific compatibility options
+
+ } else if (parameter.equals(normalize_key("show-weights"))) {
+ show_weights_and_quit = true;
+
+ } else if (parameter.equals(normalize_key("n-best-list"))) {
+ // for Moses compatibility
+ String[] tokens = fds[1].split("\\s+");
+ n_best_file = tokens[0];
+ if (tokens.length > 1)
+ topN = Integer.parseInt(tokens[1]);
+
+ } else if (parameter.equals(normalize_key("input-file"))) {
+ // for Moses compatibility
+ input_file = fds[1];
+
+ } else if (parameter.equals(normalize_key("weight-file"))) {
+ // for Moses, ignore
+
+ } else if (parameter.equals(normalize_key("weight-overwrite"))) {
+ weight_overwrite = fds[1];
+
+ } else if (parameter.equals(normalize_key("source-annotations"))) {
+ // Check source sentence
+ source_annotations = true;
+
+ } else if (parameter.equals(normalize_key("cached-rules-size"))) {
+ // Check source sentence
+ cachedRuleSize = Integer.parseInt(fds[1]);
+ } else if (parameter.equals(normalize_key("lowercase"))) {
+ lowercase = true;
+
+ } else if (parameter.equals(normalize_key("project-case"))) {
+ project_case = true;
+
+ } else {
+
+ if (parameter.equals(normalize_key("use-sent-specific-tm"))
+ || parameter.equals(normalize_key("add-combined-cost"))
+ || parameter.equals(normalize_key("use-tree-nbest"))
+ || parameter.equals(normalize_key("use-kenlm"))
+ || parameter.equals(normalize_key("useCubePrune"))
+ || parameter.equals(normalize_key("useBeamAndThresholdPrune"))
+ || parameter.equals(normalize_key("regexp-grammar"))) {
+ LOG.warn("ignoring deprecated parameter '{}'", fds[0]);
+
+ } else {
+ throw new RuntimeException("FATAL: unknown configuration parameter '" + fds[0] + "'");
+ }
+ }
+
+ LOG.info(" {} = '{}'", normalize_key(fds[0]), fds[1]);
+
+ } else {
+ /*
+ * Lines that don't have an equals sign and are not blank lines, empty lines, or comments,
+ * are feature values, which can be present in this file
+ */
+
+ weights.add(line);
+ }
+ }
+ } finally {
+ configReader.close();
+ }
+ }
+
+ /**
+ * Checks for invalid variable configurations
+ */
+ public void sanityCheck() {
+ }
+
+ /**
+ * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
+ * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
+ * camelCasing in paramter names without forcing the user to memorize them all. Here are some
+ * examples of equivalent ways to refer to parameter names:
+ * <pre>
+ * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
+ * </pre>
+ *
+ * @param text the string to be normalized
+ * @return normalized key
+ *
+ */
+ public static String normalize_key(String text) {
+ return text.replaceAll("[-_]", "").toLowerCase();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
index 184ae27,0000000..7dd3be9
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
+++ b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
@@@ -1,742 -1,0 +1,742 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.chart_parser;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.PriorityQueue;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.DotChart.DotNode;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.SourceDependentFF;
+import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.RuleCollection;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.hypergraph.HyperGraph;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.decoder.segment_file.Token;
+import org.apache.joshua.lattice.Arc;
+import org.apache.joshua.lattice.Lattice;
+import org.apache.joshua.lattice.Node;
+import org.apache.joshua.util.ChartSpan;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Chart class this class implements chart-parsing: (1) seeding the chart (2)
+ * cky main loop over bins, (3) identify applicable rules in each bin
+ *
+ * Note: the combination operation will be done in Cell
+ *
+ * Signatures of class: Cell: i, j SuperNode (used for CKY check): i,j, lhs
+ * HGNode ("or" node): i,j, lhs, edge ngrams HyperEdge ("and" node)
+ *
+ * index of sentences: start from zero index of cell: cell (i,j) represent span
+ * of words indexed [i,j-1] where i is in [0,n-1] and j is in [1,n]
+ *
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Matt Post post@cs.jhu.edu
+ */
+
+public class Chart {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Chart.class);
+ private final JoshuaConfiguration config;
+ // ===========================================================
+ // Statistics
+ // ===========================================================
+
+ /**
+ * how many items have been pruned away because its cost is greater than the
+ * cutoff in calling chart.add_deduction_in_chart()
+ */
+ int nMerged = 0;
+ int nAdded = 0;
+ int nDotitemAdded = 0; // note: there is no pruning in dot-item
+
+ public Sentence getSentence() {
+ return this.sentence;
+ }
+
+ // ===============================================================
+ // Private instance fields (maybe could be protected instead)
+ // ===============================================================
+ private ChartSpan<Cell> cells; // note that in some cell, it might be null
+ private int sourceLength;
+ private List<FeatureFunction> featureFunctions;
+ private Grammar[] grammars;
+ private DotChart[] dotcharts; // each grammar should have a dotchart associated with it
+ private Cell goalBin;
+ private int goalSymbolID = -1;
+ private Lattice<Token> inputLattice;
+
+ private Sentence sentence = null;
+// private SyntaxTree parseTree;
+// private ManualConstraintsHandler manualConstraintsHandler;
+ private StateConstraint stateConstraint;
+
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+
+ /*
+ * TODO: Once the Segment interface is adjusted to provide a Lattice<String>
+ * for the sentence() method, we should just accept a Segment instead of the
+ * sentence, segmentID, and constraintSpans parameters. We have the symbol
+ * table already, so we can do the integerization here instead of in
+ * DecoderThread. GrammarFactory.getGrammarForSentence will want the
+ * integerized sentence as well, but then we'll need to adjust that interface
+ * to deal with (non-trivial) lattices too. Of course, we get passed the
+ * grammars too so we could move all of that into here.
+ */
+
+ public Chart(Sentence sentence, List<FeatureFunction> featureFunctions, Grammar[] grammars,
+ String goalSymbol, JoshuaConfiguration config) {
+ this.config = config;
+ this.inputLattice = sentence.getLattice();
+ this.sourceLength = inputLattice.size() - 1;
+ this.featureFunctions = featureFunctions;
+
+ this.sentence = sentence;
+
+ // TODO: OOV handling no longer handles parse tree input (removed after
+ // commit 748eb69714b26dd67cba8e7c25a294347603bede)
+// this.parseTree = null;
+// if (sentence instanceof ParsedSentence)
+// this.parseTree = ((ParsedSentence) sentence).syntaxTree();
+//
+ this.cells = new ChartSpan<Cell>(sourceLength, null);
+
+ this.goalSymbolID = Vocabulary.id(goalSymbol);
+ this.goalBin = new Cell(this, this.goalSymbolID);
+
+ /* Create the grammars, leaving space for the OOV grammar. */
+ this.grammars = new Grammar[grammars.length + 1];
+ for (int i = 0; i < grammars.length; i++)
+ this.grammars[i + 1] = grammars[i];
+
+ MemoryBasedBatchGrammar oovGrammar = new MemoryBasedBatchGrammar("oov", this.config);
+ AbstractGrammar.addOOVRules(oovGrammar, sentence.getLattice(), featureFunctions,
+ this.config.true_oovs_only);
+ this.grammars[0] = oovGrammar;
+
+ // each grammar will have a dot chart
+ this.dotcharts = new DotChart[this.grammars.length];
+ for (int i = 0; i < this.grammars.length; i++)
+ this.dotcharts[i] = new DotChart(this.inputLattice, this.grammars[i], this,
+ this.grammars[i].isRegexpGrammar());
+
+ // Begin to do initialization work
+
+// manualConstraintsHandler = new ManualConstraintsHandler(this, grammars[grammars.length - 1],
+// sentence.constraints());
+
+ stateConstraint = null;
+ if (sentence.target() != null)
+ // stateConstraint = new StateConstraint(sentence.target());
+ stateConstraint = new StateConstraint(Vocabulary.START_SYM + " " + sentence.target() + " "
+ + Vocabulary.STOP_SYM);
+
+ /* Find the SourceDependent feature and give it access to the sentence. */
+ for (FeatureFunction ff : this.featureFunctions)
+ if (ff instanceof SourceDependentFF)
+ ((SourceDependentFF) ff).setSource(sentence);
+
+ LOG.debug("Finished seeding chart.");
+ }
+
+ /**
+ * Manually set the goal symbol ID. The constructor expects a String
+ * representing the goal symbol, but there may be time (say, for example, in
+ * the second pass of a synchronous parse) where we want to set the goal
+ * symbol to a particular ID (regardless of String representation).
+ * <p>
+ * This method should be called before expanding the chart, as chart expansion
+ * depends on the goal symbol ID.
+ *
+ * @param i the id of the goal symbol to use
+ */
+ public void setGoalSymbolID(int i) {
+ this.goalSymbolID = i;
+ this.goalBin = new Cell(this, i);
+ return;
+ }
+
+ // ===============================================================
+ // The primary method for filling in the chart
+ // ===============================================================
+
+ /**
+ * Construct the hypergraph with the help from DotChart using cube pruning.
+ * Cube pruning occurs at the span level, with all completed rules from the
+ * dot chart competing against each other; that is, rules with different
+ * source sides *and* rules sharing a source side but with different target
+ * sides are all in competition with each other.
+ *
+ * Terminal rules are added to the chart directly.
+ *
+ * Rules with nonterminals are added to the list of candidates. The candidates
+ * list is seeded with the list of all rules and, for each nonterminal in the
+ * rule, the 1-best tail node for that nonterminal and subspan. If the maximum
+ * arity of a rule is R, then the dimension of the hypercube is R + 1, since
+ * the first dimension is used to record the rule.
+ */
+ private void completeSpan(int i, int j) {
+
+ /* STEP 1: create the heap, and seed it with all of the candidate states */
+ PriorityQueue<CubePruneState> candidates = new PriorityQueue<CubePruneState>();
+
+ /*
+ * Look at all the grammars, seeding the chart with completed rules from the
+ * DotChart
+ */
+ for (int g = 0; g < grammars.length; g++) {
+ if (!grammars[g].hasRuleForSpan(i, j, inputLattice.distance(i, j))
+ || null == dotcharts[g].getDotCell(i, j))
+ continue;
+
+ // for each rule with applicable rules
+ for (DotNode dotNode : dotcharts[g].getDotCell(i, j).getDotNodes()) {
+ RuleCollection ruleCollection = dotNode.getRuleCollection();
+ if (ruleCollection == null)
+ continue;
+
+ List<Rule> rules = ruleCollection.getSortedRules(this.featureFunctions);
+ SourcePath sourcePath = dotNode.getSourcePath();
+
+ if (null == rules || rules.size() == 0)
+ continue;
+
+ if (ruleCollection.getArity() == 0) {
+ /*
+ * The total number of arity-0 items (pre-terminal rules) that we add
+ * is controlled by num_translation_options in the configuration.
+ *
+ * We limit the translation options per DotNode; that is, per LHS.
+ */
+ int numTranslationsAdded = 0;
+
+ /* Terminal productions are added directly to the chart */
+ for (Rule rule : rules) {
+
+ if (config.num_translation_options > 0
+ && numTranslationsAdded >= config.num_translation_options) {
+ break;
+ }
+
+ ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, rule, null, i,
+ j, sourcePath, this.sentence);
+
+ if (stateConstraint == null || stateConstraint.isLegal(result.getDPStates())) {
+ getCell(i, j).addHyperEdgeInCell(result, rule, i, j, null, sourcePath, true);
+ numTranslationsAdded++;
+ }
+ }
+ } else {
+ /* Productions with rank > 0 are subject to cube pruning */
+
+ Rule bestRule = rules.get(0);
+
+ List<HGNode> currentTailNodes = new ArrayList<HGNode>();
+ List<SuperNode> superNodes = dotNode.getAntSuperNodes();
+ for (SuperNode si : superNodes) {
+ currentTailNodes.add(si.nodes.get(0));
+ }
+
+ /*
+ * `ranks` records the current position in the cube. the 0th index is
+ * the rule, and the remaining indices 1..N correspond to the tail
+ * nodes (= nonterminals in the rule). These tail nodes are
+ * represented by SuperNodes, which group together items with the same
+ * nonterminal but different DP state (e.g., language model state)
+ */
+ int[] ranks = new int[1 + superNodes.size()];
+ Arrays.fill(ranks, 1);
+
+ ComputeNodeResult result = new ComputeNodeResult(featureFunctions, bestRule,
+ currentTailNodes, i, j, sourcePath, sentence);
+ CubePruneState bestState = new CubePruneState(result, ranks, rules, currentTailNodes,
+ dotNode);
+ candidates.add(bestState);
+ }
+ }
+ }
+
+ applyCubePruning(i, j, candidates);
+ }
+
+ /**
+ * Applies cube pruning over a span.
+ *
+ * @param i
+ * @param j
+ * @param stateConstraint
+ * @param candidates
+ */
+ private void applyCubePruning(int i, int j, PriorityQueue<CubePruneState> candidates) {
+
+ // System.err.println(String.format("CUBEPRUNE: %d-%d with %d candidates",
+ // i, j, candidates.size()));
+ // for (CubePruneState cand: candidates) {
+ // System.err.println(String.format(" CAND " + cand));
+ // }
+
+ /*
+ * There are multiple ways to reach each point in the cube, so short-circuit
+ * that.
+ */
+ HashSet<CubePruneState> visitedStates = new HashSet<CubePruneState>();
+
+ int popLimit = config.pop_limit;
+ int popCount = 0;
+ while (candidates.size() > 0 && ((++popCount <= popLimit) || popLimit == 0)) {
+ CubePruneState state = candidates.poll();
+
+ DotNode dotNode = state.getDotNode();
+ List<Rule> rules = state.rules;
+ SourcePath sourcePath = dotNode.getSourcePath();
+ List<SuperNode> superNodes = dotNode.getAntSuperNodes();
+
+ /*
+ * Add the hypothesis to the chart. This can only happen if (a) we're not
+ * doing constrained decoding or (b) we are and the state is legal.
+ */
+ if (stateConstraint == null || stateConstraint.isLegal(state.getDPStates())) {
+ getCell(i, j).addHyperEdgeInCell(state.computeNodeResult, state.getRule(), i, j,
+ state.antNodes, sourcePath, true);
+ }
+
+ /*
+ * Expand the hypothesis by walking down a step along each dimension of
+ * the cube, in turn. k = 0 means we extend the rule being used; k > 0
+ * expands the corresponding tail node.
+ */
+
+ for (int k = 0; k < state.ranks.length; k++) {
+
+ /* Copy the current ranks, then extend the one we're looking at. */
+ int[] nextRanks = new int[state.ranks.length];
+ System.arraycopy(state.ranks, 0, nextRanks, 0, state.ranks.length);
+ nextRanks[k]++;
+
+ /*
+ * We might have reached the end of something (list of rules or tail
+ * nodes)
+ */
+ if (k == 0
+ && (nextRanks[k] > rules.size() || (config.num_translation_options > 0 && nextRanks[k] > config.num_translation_options)))
+ continue;
+ else if ((k != 0 && nextRanks[k] > superNodes.get(k - 1).nodes.size()))
+ continue;
+
+ /* Use the updated ranks to assign the next rule and tail node. */
+ Rule nextRule = rules.get(nextRanks[0] - 1);
+ // HGNode[] nextAntNodes = new HGNode[state.antNodes.size()];
- List<HGNode> nextAntNodes = new ArrayList<HGNode>();
++ List<HGNode> nextAntNodes = new ArrayList<HGNode>(state.antNodes.size());
+ for (int x = 0; x < state.ranks.length - 1; x++)
+ nextAntNodes.add(superNodes.get(x).nodes.get(nextRanks[x + 1] - 1));
+
+ /* Create the next state. */
+ CubePruneState nextState = new CubePruneState(new ComputeNodeResult(featureFunctions,
+ nextRule, nextAntNodes, i, j, sourcePath, this.sentence), nextRanks, rules,
+ nextAntNodes, dotNode);
+
+ /* Skip states that have been explored before. */
+ if (visitedStates.contains(nextState))
+ continue;
+
+ visitedStates.add(nextState);
+ candidates.add(nextState);
+ }
+ }
+ }
+
+ /* Create a priority queue of candidates for each span under consideration */
+ private PriorityQueue<CubePruneState>[] allCandidates;
+
+ private ArrayList<SuperNode> nodeStack;
+
+ /**
+ * Translates the sentence using the CKY+ variation proposed in
+ * "A CYK+ Variant for SCFG Decoding Without A Dot Chart" (Sennrich, SSST
+ * 2014).
+ */
+ private int i = -1;
+
+ public HyperGraph expandSansDotChart() {
+ for (i = sourceLength - 1; i >= 0; i--) {
+ allCandidates = new PriorityQueue[sourceLength - i + 2];
+ for (int id = 0; id < allCandidates.length; id++)
+ allCandidates[id] = new PriorityQueue<CubePruneState>();
+
+ nodeStack = new ArrayList<SuperNode>();
+
+ for (int j = i + 1; j <= sourceLength; j++) {
+ if (!sentence.hasPath(i, j))
+ continue;
+
+ for (int g = 0; g < this.grammars.length; g++) {
+ // System.err.println(String.format("\n*** I=%d J=%d GRAMMAR=%d", i, j, g));
+
+ if (j == i + 1) {
+ /* Handle terminals */
+ Node<Token> node = sentence.getNode(i);
+ for (Arc<Token> arc : node.getOutgoingArcs()) {
+ int word = arc.getLabel().getWord();
+ // disallow lattice decoding for now
+ assert arc.getHead().id() == j;
+ Trie trie = this.grammars[g].getTrieRoot().match(word);
+ if (trie != null && trie.hasRules())
+ addToChart(trie, j, false);
+ }
+ } else {
+ /* Recurse for non-terminal case */
+ consume(this.grammars[g].getTrieRoot(), i, j - 1);
+ }
+ }
+
+ // Now that we've accumulated all the candidates, apply cube pruning
+ applyCubePruning(i, j, allCandidates[j - i]);
+
+ // Add unary nodes
+ addUnaryNodes(this.grammars, i, j);
+ }
+ }
+
+ // transition_final: setup a goal item, which may have many deductions
+ if (null == this.cells.get(0, sourceLength)
+ || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
+ this.sourceLength)) {
+ LOG.info("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
+ sentence.id());
+ return null;
+ }
+
+ return new HyperGraph(this.goalBin.getSortedNodes().get(0), -1, -1, this.sentence);
+ }
+
+ /**
+ * Recursively consumes the trie, following input nodes, finding applicable
+ * rules and adding them to bins for each span for later cube pruning.
+ *
+ * @param dotNode data structure containing information about what's been
+ * already matched
+ * @param l extension point we're looking at
+ *
+ */
+ private void consume(Trie trie, int j, int l) {
+ /*
+ * 1. If the trie node has any rules, we can add them to the collection
+ *
+ * 2. Next, look at all the outgoing nonterminal arcs of the trie node. If
+ * any of them match an existing chart item, then we know we can extend
+ * (i,j) to (i,l). We then recurse for all m from l+1 to n (the end of the
+ * sentence)
+ *
+ * 3. We also try to match terminals if (j + 1 == l)
+ */
+
+ // System.err.println(String.format("CONSUME %s / %d %d %d", dotNode,
+ // dotNode.begin(), dotNode.end(), l));
+
+ // Try to match terminals
+ if (inputLattice.distance(j, l) == 1) {
+ // Get the current sentence node, and explore all outgoing arcs, since we
+ // might be decoding
+ // a lattice. For sentence decoding, this is trivial: there is only one
+ // outgoing arc.
+ Node<Token> inputNode = sentence.getNode(j);
+ for (Arc<Token> arc : inputNode.getOutgoingArcs()) {
+ int word = arc.getLabel().getWord();
+ Trie nextTrie;
+ if ((nextTrie = trie.match(word)) != null) {
+ // add to chart item over (i, l)
+ addToChart(nextTrie, arc.getHead().id(), i == j);
+ }
+ }
+ }
+
+ // Now try to match nonterminals
+ Cell cell = cells.get(j, l);
+ if (cell != null) {
+ for (int id : cell.getKeySet()) { // for each supernode (lhs), see if you
+ // can match a trie
+ Trie nextTrie = trie.match(id);
+ if (nextTrie != null) {
+ SuperNode superNode = cell.getSuperNode(id);
+ nodeStack.add(superNode);
+ addToChart(nextTrie, superNode.end(), i == j);
+ nodeStack.remove(nodeStack.size() - 1);
+ }
+ }
+ }
+ }
+
+ /**
+ * Adds all rules at a trie node to the chart, unless its a unary rule. A
+ * unary rule is the first outgoing arc of a grammar's root trie. For
+ * terminals, these are added during the seeding stage; for nonterminals,
+ * these confuse cube pruning and can result in infinite loops, and are
+ * handled separately (see addUnaryNodes());
+ *
+ * @param trie the grammar node
+ * @param isUnary whether the rules at this dotnode are unary
+ */
+ private void addToChart(Trie trie, int j, boolean isUnary) {
+
+ // System.err.println(String.format("ADD TO CHART %s unary=%s", dotNode,
+ // isUnary));
+
+ if (!isUnary && trie.hasRules()) {
+ DotNode dotNode = new DotNode(i, j, trie, new ArrayList<SuperNode>(nodeStack), null);
+
+ addToCandidates(dotNode);
+ }
+
+ for (int l = j + 1; l <= sentence.length(); l++)
+ consume(trie, j, l);
+ }
+
+ /**
+ * Record the completed rule with backpointers for later cube-pruning.
+ *
+ * @param width
+ * @param rules
+ * @param tailNodes
+ */
+ private void addToCandidates(DotNode dotNode) {
+ // System.err.println(String.format("ADD TO CANDIDATES %s AT INDEX %d",
+ // dotNode, dotNode.end() - dotNode.begin()));
+
+ // TODO: one entry per rule, or per rule instantiation (rule together with
+ // unique matching of input)?
+ List<Rule> rules = dotNode.getRuleCollection().getSortedRules(featureFunctions);
+ Rule bestRule = rules.get(0);
+ List<SuperNode> superNodes = dotNode.getAntSuperNodes();
+
+ List<HGNode> tailNodes = new ArrayList<HGNode>();
+ for (SuperNode superNode : superNodes)
+ tailNodes.add(superNode.nodes.get(0));
+
+ int[] ranks = new int[1 + superNodes.size()];
+ Arrays.fill(ranks, 1);
+
+ ComputeNodeResult result = new ComputeNodeResult(featureFunctions, bestRule, tailNodes,
+ dotNode.begin(), dotNode.end(), dotNode.getSourcePath(), sentence);
+ CubePruneState seedState = new CubePruneState(result, ranks, rules, tailNodes, dotNode);
+
+ allCandidates[dotNode.end() - dotNode.begin()].add(seedState);
+ }
+
+ /**
+ * This function performs the main work of decoding.
+ *
+ * @return the hypergraph containing the translated sentence.
+ */
+ public HyperGraph expand() {
+
+ for (int width = 1; width <= sourceLength; width++) {
+ for (int i = 0; i <= sourceLength - width; i++) {
+ int j = i + width;
+ LOG.debug("Processing span ({}, {})", i, j);
+
+ /* Skips spans for which no path exists (possible in lattices). */
+ if (inputLattice.distance(i, j) == Float.POSITIVE_INFINITY) {
+ continue;
+ }
+
+ /*
+ * 1. Expand the dot through all rules. This is a matter of (a) look for
+ * rules over (i,j-1) that need the terminal at (j-1,j) and looking at
+ * all split points k to expand nonterminals.
+ */
+ LOG.debug("Expanding cell");
+ for (int k = 0; k < this.grammars.length; k++) {
+ /**
+ * Each dotChart can act individually (without consulting other
+ * dotCharts) because it either consumes the source input or the
+ * complete nonTerminals, which are both grammar-independent.
+ **/
+ this.dotcharts[k].expandDotCell(i, j);
+ }
+
+ /*
+ * 2. The regular CKY part: add completed items onto the chart via cube
+ * pruning.
+ */
+ LOG.debug("Adding complete items into chart");
+ completeSpan(i, j);
+
+ /* 3. Process unary rules. */
+ LOG.debug("Adding unary items into chart");
+ addUnaryNodes(this.grammars, i, j);
+
+ // (4)=== in dot_cell(i,j), add dot-nodes that start from the /complete/
+ // superIterms in
+ // chart_cell(i,j)
+ LOG.debug("Initializing new dot-items that start from complete items in this cell");
+ for (int k = 0; k < this.grammars.length; k++) {
+ if (this.grammars[k].hasRuleForSpan(i, j, inputLattice.distance(i, j))) {
+ this.dotcharts[k].startDotItems(i, j);
+ }
+ }
+
+ /*
+ * 5. Sort the nodes in the cell.
+ *
+ * Sort the nodes in this span, to make them usable for future
+ * applications of cube pruning.
+ */
+ if (null != this.cells.get(i, j)) {
+ this.cells.get(i, j).getSortedNodes();
+ }
+ }
+ }
+
+ logStatistics();
+
+ // transition_final: setup a goal item, which may have many deductions
+ if (null == this.cells.get(0, sourceLength)
+ || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
+ this.sourceLength)) {
+ LOG.info("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
+ sentence.id());
+ return null;
+ }
+
+ LOG.debug("Finished expand");
+ return new HyperGraph(this.goalBin.getSortedNodes().get(0), -1, -1, this.sentence);
+ }
+
+ /**
+ * Get the requested cell, creating the entry if it doesn't already exist.
+ *
+ * @param i span start
+ * @param j span end
+ * @return the cell item
+ */
+ public Cell getCell(int i, int j) {
+ assert i >= 0;
+ assert i <= sentence.length();
+ assert i <= j;
+ if (cells.get(i, j) == null)
+ cells.set(i, j, new Cell(this, goalSymbolID));
+
+ return cells.get(i, j);
+ }
+
+ // ===============================================================
+ // Private methods
+ // ===============================================================
+
+ private void logStatistics() {
+ LOG.info("Input {}: Chart: added {} merged {} dot-items added: {}",
+ this.sentence.id(), this.nAdded, this.nMerged, this.nDotitemAdded);
+ }
+
+ /**
+ * Handles expansion of unary rules. Rules are expanded in an agenda-based
+ * manner to avoid constructing infinite unary chains. Assumes a triangle
+ * inequality of unary rule expansion (e.g., A -> B will always be cheaper
+ * than A -> C -> B), which is not a true assumption.
+ *
+ * @param grammars A list of the grammars for the sentence
+ * @param i
+ * @param j
+ * @return the number of nodes added
+ */
+ private int addUnaryNodes(Grammar[] grammars, int i, int j) {
+
+ Cell chartBin = this.cells.get(i, j);
+ if (null == chartBin) {
+ return 0;
+ }
+ int qtyAdditionsToQueue = 0;
+ ArrayList<HGNode> queue = new ArrayList<HGNode>(chartBin.getSortedNodes());
+ HashSet<Integer> seen_lhs = new HashSet<Integer>();
+
+ LOG.debug("Adding unary to [{}, {}]", i, j);
+
+ while (queue.size() > 0) {
+ HGNode node = queue.remove(0);
+ seen_lhs.add(node.lhs);
+
+ for (Grammar gr : grammars) {
+ if (!gr.hasRuleForSpan(i, j, inputLattice.distance(i, j)))
+ continue;
+
+ /*
+ * Match against the node's LHS, and then make sure the rule collection
+ * has unary rules
+ */
+ Trie childNode = gr.getTrieRoot().match(node.lhs);
+ if (childNode != null && childNode.getRuleCollection() != null
+ && childNode.getRuleCollection().getArity() == 1) {
+
+ ArrayList<HGNode> antecedents = new ArrayList<HGNode>();
+ antecedents.add(node);
+
+ List<Rule> rules = childNode.getRuleCollection().getSortedRules(this.featureFunctions);
+ for (Rule rule : rules) { // for each unary rules
+
+ ComputeNodeResult states = new ComputeNodeResult(this.featureFunctions, rule,
+ antecedents, i, j, new SourcePath(), this.sentence);
+ HGNode resNode = chartBin.addHyperEdgeInCell(states, rule, i, j, antecedents,
+ new SourcePath(), true);
+
+ LOG.debug("{}", rule);
+ if (null != resNode && !seen_lhs.contains(resNode.lhs)) {
+ queue.add(resNode);
+ qtyAdditionsToQueue++;
+ }
+ }
+ }
+ }
+ }
+ return qtyAdditionsToQueue;
+ }
+
+ /***
+ * Add a terminal production (X -> english phrase) to the hypergraph.
+ *
+ * @param i the start index
+ * @param j stop index
+ * @param rule the terminal rule applied
+ * @param srcPath the source path cost
+ */
+ public void addAxiom(int i, int j, Rule rule, SourcePath srcPath) {
+ if (null == this.cells.get(i, j)) {
+ this.cells.set(i, j, new Cell(this, this.goalSymbolID));
+ }
+
+ this.cells.get(i, j).addHyperEdgeInCell(
+ new ComputeNodeResult(this.featureFunctions, rule, null, i, j, srcPath, sentence), rule, i,
+ j, null, srcPath, false);
+
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
index 7c2fe5c,0000000..d57a6a2
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
+++ b/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
@@@ -1,116 -1,0 +1,114 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.chart_parser;
+
- import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.chart_parser.DotChart.DotNode;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+
+// ===============================================================
+// CubePruneState class
+// ===============================================================
+public class CubePruneState implements Comparable<CubePruneState> {
+ int[] ranks;
+ ComputeNodeResult computeNodeResult;
+ List<HGNode> antNodes;
+ List<Rule> rules;
+ private DotNode dotNode;
+
+ public CubePruneState(ComputeNodeResult score, int[] ranks, List<Rule> rules, List<HGNode> antecedents, DotNode dotNode) {
+ this.computeNodeResult = score;
+ this.ranks = ranks;
+ this.rules = rules;
- // create a new vector is critical, because currentAntecedents will change later
- this.antNodes = new ArrayList<HGNode>(antecedents);
++ this.antNodes = antecedents;
+ this.dotNode = dotNode;
+ }
+
+ /**
+ * This returns the list of DP states associated with the result.
+ *
+ * @return
+ */
+ List<DPState> getDPStates() {
+ return this.computeNodeResult.getDPStates();
+ }
+
+ Rule getRule() {
+ return this.rules.get(this.ranks[0]-1);
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("STATE ||| rule=" + getRule() + " inside cost = " + computeNodeResult.getViterbiCost()
+ + " estimate = " + computeNodeResult.getPruningEstimate());
+ return sb.toString();
+ }
+
+ public void setDotNode(DotNode node) {
+ this.dotNode = node;
+ }
+
+ public DotNode getDotNode() {
+ return this.dotNode;
+ }
+
+ public boolean equals(Object obj) {
+ if (obj == null)
+ return false;
+ if (!this.getClass().equals(obj.getClass()))
+ return false;
+ CubePruneState state = (CubePruneState) obj;
+ if (state.ranks.length != ranks.length)
+ return false;
+ for (int i = 0; i < ranks.length; i++)
+ if (state.ranks[i] != ranks[i])
+ return false;
+ if (getDotNode() != state.getDotNode())
+ return false;
+
+ return true;
+ }
+
+ public int hashCode() {
+ int hash = (dotNode != null) ? dotNode.hashCode() : 0;
+ hash += Arrays.hashCode(ranks);
+
+ return hash;
+ }
+
+ /**
+ * Compares states by ExpectedTotalLogP, allowing states to be sorted according to their inverse
+ * order (high-prob first).
+ */
+ public int compareTo(CubePruneState another) {
+ if (this.computeNodeResult.getPruningEstimate() < another.computeNodeResult
+ .getPruningEstimate()) {
+ return 1;
+ } else if (this.computeNodeResult.getPruningEstimate() == another.computeNodeResult
+ .getPruningEstimate()) {
+ return 0;
+ } else {
+ return -1;
+ }
+ }
+}