You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/27 00:34:32 UTC
[26/32] incubator-joshua git commit: Merge branch 'master' into JOSHUA-252 (compiling, but not tested)

Merge branch 'master' into JOSHUA-252 (compiling, but not tested)

# Conflicts:
#	src/joshua/decoder/ff/tm/format/MosesFormatReader.java
#	src/joshua/decoder/ff/tm/format/PhraseFormatReader.java
#	src/main/java/org/apache/joshua/corpus/Vocabulary.java
#	src/main/java/org/apache/joshua/decoder/DecoderThread.java
#	src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
#	src/main/java/org/apache/joshua/decoder/ff/TargetBigram.java
#	src/main/java/org/apache/joshua/decoder/ff/lm/LanguageModelFF.java
#	src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/PhraseRule.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/format/HieroFormatReader.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/format/PhraseFormatReader.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/format/SamtFormatReader.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
#	src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
#	src/main/java/org/apache/joshua/decoder/hypergraph/GrammarBuilderWalkerFunction.java
#	src/main/java/org/apache/joshua/decoder/hypergraph/OutputStringExtractor.java
#	src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
#	src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
#	src/main/java/org/apache/joshua/oracle/OracleExtractionHG.java
#	src/main/java/org/apache/joshua/tools/GrammarPacker.java
#	src/main/java/org/apache/joshua/util/CompareGrammars.java


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/abb8c518
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/abb8c518
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/abb8c518

Branch: refs/heads/JOSHUA-252
Commit: abb8c5181e85c67679573d6c7538e498a404d8af
Parents: 9d6f84d 4d73c17
Author: Matt Post <po...@cs.jhu.edu>
Authored: Thu May 26 17:44:48 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu May 26 17:44:48 2016 -0400

----------------------------------------------------------------------
 scripts/support/grammar-packer.pl               |  10 +-
 scripts/support/moses2joshua_grammar.pl         |  13 +-
 scripts/support/moses_phrase_to_joshua.pl       |  23 ---
 scripts/support/run_bundler.py                  |   1 +
 scripts/support/split2files                     |  44 ++++
 scripts/support/splittabs.pl                    |  42 ----
 scripts/training/pipeline.pl                    |  97 ++-------
 scripts/training/split2files.pl                 |  38 ----
 scripts/training/trim_parallel_corpus.pl        |   2 +-
 .../apache/joshua/corpus/TerminalIterator.java  |   4 +-
 .../org/apache/joshua/corpus/Vocabulary.java    |  16 +-
 .../joshua/decoder/JoshuaConfiguration.java     |  17 +-
 .../joshua/decoder/chart_parser/Chart.java      |   2 +-
 .../decoder/chart_parser/CubePruneState.java    |   4 +-
 .../apache/joshua/decoder/ff/TargetBigram.java  |   3 +-
 .../joshua/decoder/ff/lm/LanguageModelFF.java   |   5 +-
 .../ff/lm/StateMinimizingLanguageModel.java     |   5 +-
 .../joshua/decoder/ff/tm/GrammarReader.java     |  54 +----
 .../apache/joshua/decoder/ff/tm/PhraseRule.java |  94 ---------
 .../org/apache/joshua/decoder/ff/tm/Rule.java   |  30 +--
 .../decoder/ff/tm/format/HieroFormatReader.java |  85 ++++----
 .../decoder/ff/tm/format/MosesFormatReader.java | 106 ++++++++++
 .../ff/tm/format/PhraseFormatReader.java        | 128 ------------
 .../decoder/ff/tm/format/SamtFormatReader.java  | 136 -------------
 .../tm/hash_based/MemoryBasedBatchGrammar.java  |  12 +-
 .../decoder/ff/tm/packed/PackedGrammar.java     |  35 +++-
 .../GrammarBuilderWalkerFunction.java           |  26 +--
 .../joshua/decoder/hypergraph/HyperEdge.java    |  11 +-
 .../hypergraph/OutputStringExtractor.java       |  11 +-
 .../joshua/decoder/phrase/PhraseTable.java      |  13 +-
 .../joshua/oracle/OracleExtractionHG.java       |   3 +-
 .../org/apache/joshua/tools/GrammarPacker.java  | 164 +++++++--------
 .../org/apache/joshua/util/CompareGrammars.java | 200 -------------------
 .../org/apache/joshua/util/FormatUtils.java     |  28 ++-
 .../apache/joshua/corpus/VocabularyTest.java    |  22 +-
 .../org/apache/joshua/util/FormatUtilsTest.java |   6 +-
 src/test/resources/decoder/phrase/decode/config |   2 +-
 .../decoder/phrase/decode/rules.packed/config   |   3 +-
 .../decode/rules.packed/slice_00000.features    | Bin 4128858 -> 4128858 bytes
 .../decode/rules.packed/slice_00000.source      | Bin 1982228 -> 1982244 bytes
 .../decode/rules.packed/slice_00000.target      | Bin 1463856 -> 2652936 bytes
 .../rules.packed/slice_00000.target.lookup      | Bin 28 -> 32 bytes
 .../phrase/decode/rules.packed/vocabulary       | Bin 169225 -> 169236 bytes
 .../decoder/phrase/include-align-index/test.sh  |  17 +-
 44 files changed, 436 insertions(+), 1076 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/scripts/training/pipeline.pl
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/corpus/TerminalIterator.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/corpus/TerminalIterator.java
index e82b4cc,0000000..fcf5c72
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/corpus/TerminalIterator.java
+++ b/src/main/java/org/apache/joshua/corpus/TerminalIterator.java
@@@ -1,83 -1,0 +1,85 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.corpus;
 +
 +import java.util.Iterator;
 +import java.util.NoSuchElementException;
 +
++import org.apache.joshua.util.FormatUtils;
++
 +/**
 + * Iterator capable of iterating over those word identifiers in a phrase which represent terminals.
 + * <p>
 + * <em>Note</em>: This class is <em>not</em> thread-safe.
 + * 
 + * @author Lane Schwartz
 + */
 +public class TerminalIterator implements Iterator<Integer> {
 +
 +  private final int[] words;
 +
 +  private int nextIndex = -1;
 +  private int next = Integer.MIN_VALUE;
 +  private boolean dirty = true;
 +
 +  /**
 +   * Constructs an iterator for the terminals in the given list of words.
 +   * 
 +   * @param words array of words
 +   */
 +  public TerminalIterator(int[] words) {
 +    this.words = words;
 +  }
 +
 +  /* See Javadoc for java.util.Iterator#next(). */
 +  public boolean hasNext() {
 +
-     while (dirty || Vocabulary.nt(next)) {
++    while (dirty || FormatUtils.isNonterminal(next)) {
 +      nextIndex++;
 +      if (nextIndex < words.length) {
 +        next = words[nextIndex];
 +        dirty = false;
 +      } else {
 +        return false;
 +      }
 +    }
 +
 +    return true;
 +  }
 +
 +  /* See Javadoc for java.util.Iterator#next(). */
 +  public Integer next() {
 +    if (hasNext()) {
 +      dirty = true;
 +      return next;
 +    } else {
 +      throw new NoSuchElementException();
 +    }
 +  }
 +
 +  /**
 +   * Unsupported operation, guaranteed to throw an UnsupportedOperationException.
 +   * 
 +   * @throws UnsupportedOperationException operation not supported yet!
 +   */
 +  public void remove() {
 +    throw new UnsupportedOperationException();
 +  }
 +
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/corpus/Vocabulary.java
index 2bcc447,0000000..8416e4a
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/corpus/Vocabulary.java
+++ b/src/main/java/org/apache/joshua/corpus/Vocabulary.java
@@@ -1,309 -1,0 +1,295 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.corpus;
 +
 +import java.io.BufferedInputStream;
 +import java.io.BufferedOutputStream;
 +import java.io.DataInputStream;
 +import java.io.DataOutputStream;
 +import java.io.Externalizable;
 +import java.io.File;
 +import java.io.FileInputStream;
 +import java.io.FileOutputStream;
 +import java.io.IOException;
 +import java.io.ObjectInput;
 +import java.io.ObjectOutput;
 +import java.util.ArrayList;
 +import java.util.HashMap;
 +import java.util.List;
 +import java.util.Map;
 +import java.util.concurrent.locks.StampedLock;
 +
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.ff.lm.NGramLanguageModel;
 +import org.apache.joshua.util.FormatUtils;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * Static singular vocabulary class.
 + * Supports (de-)serialization into a vocabulary file.
 + *
 + * @author Juri Ganitkevitch
 + */
 +
 +public class Vocabulary implements Externalizable {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(Vocabulary.class);
 +  private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<>();
 +
 +  private static List<String> idToString;
 +  private static Map<String, Integer> stringToId;
 +  private static final StampedLock lock = new StampedLock();
 +
 +  static final int UNKNOWN_ID = 0;
 +  static final String UNKNOWN_WORD = "<unk>";
 +
 +  public static final String START_SYM = "<s>";
 +  public static final String STOP_SYM = "</s>";
 +
 +  static {
 +    clear();
 +  }
 +
 +  public static boolean registerLanguageModel(NGramLanguageModel lm) {
 +    long lock_stamp = lock.writeLock();
 +    try {
 +      // Store the language model.
 +      LMs.add(lm);
 +      // Notify it of all the existing words.
 +      boolean collision = false;
 +      for (int i = idToString.size() - 1; i > 0; i--)
 +        collision = collision || lm.registerWord(idToString.get(i), i);
 +      return collision;
 +    } finally {
 +      lock.unlockWrite(lock_stamp);
 +    }
 +  }
 +
 +  /**
 +   * Reads a vocabulary from file. This deletes any additions to the vocabulary made prior to
 +   * reading the file.
 +   *
 +   * @param vocab_file path to a vocabulary file
 +   * @return Returns true if vocabulary was read without mismatches or collisions.
 +   * @throws IOException of the file cannot be found or read properly
 +   */
 +  public static boolean read(final File vocab_file) throws IOException {
 +    DataInputStream vocab_stream =
 +        new DataInputStream(new BufferedInputStream(new FileInputStream(vocab_file)));
 +    int size = vocab_stream.readInt();
 +    LOG.info("Read {} entries from the vocabulary", size);
 +    clear();
 +    for (int i = 0; i < size; i++) {
 +      int id = vocab_stream.readInt();
 +      String token = vocab_stream.readUTF();
 +      if (id != Math.abs(id(token))) {
 +        vocab_stream.close();
 +        return false;
 +      }
 +    }
 +    vocab_stream.close();
 +    return (size + 1 == idToString.size());
 +  }
 +
 +  public static void write(String file_name) throws IOException {
 +    long lock_stamp =lock.readLock();
 +    try {
 +      File vocab_file = new File(file_name);
 +      DataOutputStream vocab_stream =
 +          new DataOutputStream(new BufferedOutputStream(new FileOutputStream(vocab_file)));
 +      vocab_stream.writeInt(idToString.size() - 1);
 +      LOG.info("Writing vocabulary: {} tokens", idToString.size() - 1);
 +      for (int i = 1; i < idToString.size(); i++) {
 +        vocab_stream.writeInt(i);
 +        vocab_stream.writeUTF(idToString.get(i));
 +      }
 +      vocab_stream.close();
 +    }
 +    finally{
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  /**
 +   * Get the id of the token if it already exists, new id is created otherwise.
 +   *
 +   * TODO: currently locks for every call. Separate constant (frozen) ids from
 +   * changing (e.g. OOV) ids. Constant ids could be immutable -&gt; no locking.
 +   * Alternatively: could we use ConcurrentHashMap to not have to lock if
 +   * actually contains it and only lock for modifications?
 +   * 
 +   * @param token a token to obtain an id for
 +   * @return the token id
 +   */
 +  public static int id(String token) {
 +    // First attempt an optimistic read
 +    long attempt_read_lock = lock.tryOptimisticRead();
 +    if (stringToId.containsKey(token)) {
 +      int resultId = stringToId.get(token);
 +      if (lock.validate(attempt_read_lock)) {
 +        return resultId;
 +      }
 +    }
 +
 +    // The optimistic read failed, try a read with a stamped read lock
 +    long read_lock_stamp = lock.readLock();
 +    try {
 +      if (stringToId.containsKey(token)) {
 +        return stringToId.get(token);
 +      }
 +    } finally {
 +      lock.unlockRead(read_lock_stamp);
 +    }
 +
 +    // Looks like the id we want is not there, let's get a write lock and add it
 +    long write_lock_stamp = lock.writeLock();
 +    try {
 +      if (stringToId.containsKey(token)) {
 +        return stringToId.get(token);
 +      }
-       int id = idToString.size() * (nt(token) ? -1 : 1);
++      int id = idToString.size() * (FormatUtils.isNonterminal(token) ? -1 : 1);
 +
 +      // register this (token,id) mapping with each language
 +      // model, so that they can map it to their own private
 +      // vocabularies
 +      for (NGramLanguageModel lm : LMs)
 +        lm.registerWord(token, Math.abs(id));
 +
 +      idToString.add(token);
 +      stringToId.put(token, id);
 +      return id;
 +    } finally {
 +      lock.unlockWrite(write_lock_stamp);
 +    }
 +  }
 +
 +  public static boolean hasId(int id) {
 +    long lock_stamp = lock.readLock();
 +    try {
 +      id = Math.abs(id);
 +      return (id < idToString.size());
 +    }
 +    finally{
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  public static int[] addAll(String sentence) {
 +    return addAll(sentence.split("\\s+"));
 +  }
 +
 +  public static int[] addAll(String[] tokens) {
 +    int[] ids = new int[tokens.length];
 +    for (int i = 0; i < tokens.length; i++)
 +      ids[i] = id(tokens[i]);
 +    return ids;
 +  }
 +
 +  public static String word(int id) {
 +    long lock_stamp = lock.readLock();
 +    try {
 +      id = Math.abs(id);
 +      return idToString.get(id);
 +    }
 +    finally{
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  public static String getWords(int[] ids) {
 +    if (ids.length == 0) return "";
 +    StringBuilder sb = new StringBuilder();
 +    for (int i = 0; i < ids.length - 1; i++)
 +      sb.append(word(ids[i])).append(" ");
 +    return sb.append(word(ids[ids.length - 1])).toString();
 +  }
 +
 +  public static String getWords(final Iterable<Integer> ids) {
 +    StringBuilder sb = new StringBuilder();
 +    for (int id : ids)
 +      sb.append(word(id)).append(" ");
 +    return sb.deleteCharAt(sb.length() - 1).toString();
 +  }
 +
 +  public static int getUnknownId() {
 +    return UNKNOWN_ID;
 +  }
 +
 +  public static String getUnknownWord() {
 +    return UNKNOWN_WORD;
 +  }
 +
-   /**
-    * Returns true if the Vocabulary ID represents a nonterminal.
-    *
-    * @param id vocabularly ID to check
-    * @return true if the Vocabulary ID represents a nonterminal
-    */
-   public static boolean nt(int id) {
-     return (id < 0);
-   }
- 
-   public static boolean nt(String word) {
-     return FormatUtils.isNonterminal(word);
-   }
- 
 +  public static int size() {
 +    long lock_stamp = lock.readLock();
 +    try {
 +      return idToString.size();
 +    } finally {
 +      lock.unlockRead(lock_stamp);
 +    }
 +  }
 +
 +  public static synchronized int getTargetNonterminalIndex(int id) {
 +    return FormatUtils.getNonterminalIndex(word(id));
 +  }
 +
 +  /**
 +   * Clears the vocabulary and initializes it with an unknown word. Registered
 +   * language models are left unchanged.
 +   */
 +  public static void clear() {
 +    long lock_stamp = lock.writeLock();
 +    try {
 +      idToString = new ArrayList<String>();
 +      stringToId = new HashMap<String, Integer>();
 +
 +      idToString.add(UNKNOWN_ID, UNKNOWN_WORD);
 +      stringToId.put(UNKNOWN_WORD, UNKNOWN_ID);
 +    } finally {
 +      lock.unlockWrite(lock_stamp);
 +    }
 +  }
 +
 +  public static void unregisterLanguageModels() {
 +    LMs.clear();
 +  }
 +
 +  @Override
 +  public void writeExternal(ObjectOutput out) throws IOException {
 +    // TODO Auto-generated method stub
 +
 +  }
 +
 +  @Override
 +  public void readExternal(ObjectInput in)
 +      throws IOException, ClassNotFoundException {
 +    // TODO Auto-generated method stub
 +
 +  }
 +
 +  @Override
 +  public boolean equals(Object o) {
 +    if(getClass() == o.getClass()) {
 +      return true;
 +    } else {
 +      return false;
 +    }
 +  }
 +
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
index 2bf8e37,0000000..5acfd7e
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
+++ b/src/main/java/org/apache/joshua/decoder/JoshuaConfiguration.java
@@@ -1,713 -1,0 +1,712 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder;
 +
 +import static org.apache.joshua.util.FormatUtils.cleanNonTerminal;
- import static org.apache.joshua.util.FormatUtils.markup;
++import static org.apache.joshua.util.FormatUtils.ensureNonTerminalBrackets;
 +
 +import java.io.File;
 +import java.io.FileWriter;
 +import java.io.IOException;
 +import java.io.PrintWriter;
 +import java.io.BufferedReader;
 +import java.io.FileReader;
 +import java.util.ArrayList;
 +import java.util.Collections;
 +
 +import org.apache.joshua.decoder.ff.StatefulFF;
 +import org.apache.joshua.decoder.ff.fragmentlm.Tree;
 +import org.apache.joshua.util.FormatUtils;
 +import org.apache.joshua.util.Regex;
 +import org.apache.joshua.util.io.LineReader;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * Configuration file for Joshua decoder.
 + *
 + * When adding new features to Joshua, any new configurable parameters should be added to this
 + * class.
 + *
 + * @author Zhifei Li, zhifei.work@gmail.com
 + * @author Matt Post post@cs.jhu.edu
 + */
 +public class JoshuaConfiguration {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(JoshuaConfiguration.class);
 +
 +  // whether to construct a StructuredTranslation object for each request instead of
 +  // printing to stdout. Used when the Decoder is used from Java directly.
 +  public Boolean use_structured_output = false;
 +
 +  // If set to true, Joshua will lowercase the input, creating an annotation that marks the
 +  // original case
 +  public boolean lowercase = false;
 +
 +  // If set to true, Joshua will recapitalize the output by projecting the case from aligned
 +  // source-side words
 +  public boolean project_case = false;
 +
 +  // List of grammar files to read
 +  public ArrayList<String> tms = new ArrayList<String>();
 +
 +  // A rule cache for commonly used tries to avoid excess object allocations
 +  // Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
 +  public Integer cachedRuleSize = new Integer(5000);
 +
 +  /*
 +   * The file to read the weights from (part of the sparse features implementation). Weights can
 +   * also just be listed in the main config file.
 +   */
 +  public String weights_file = "";
 +  // Default symbols. The symbol here should be enclosed in square brackets.
-   public String default_non_terminal = FormatUtils.markup("X");
- 
-   public String goal_symbol = FormatUtils.markup("GOAL");
++  public String default_non_terminal = FormatUtils.ensureNonTerminalBrackets("X");
++  public String goal_symbol = FormatUtils.ensureNonTerminalBrackets("GOAL");
 +
 +  /*
 +   * A list of OOV symbols in the form
 +   *
 +   * [X1] weight [X2] weight [X3] weight ...
 +   *
 +   * where the [X] symbols are nonterminals and the weights are weights. For each OOV word w in the
 +   * input sentence, Joshua will create rules of the form
 +   *
 +   * X1 -> w (weight)
 +   *
 +   * If this is empty, an unweighted default_non_terminal is used.
 +   */
 +  public class OOVItem implements Comparable<OOVItem> {
 +    public String label;
 +
 +    public float weight;
 +
 +    OOVItem(String l, float w) {
 +      label = l;
 +      weight = w;
 +    }
 +    @Override
 +    public int compareTo(OOVItem other) {
 +      if (weight > other.weight)
 +        return -1;
 +      else if (weight < other.weight)
 +        return 1;
 +      return 0;
 +    }
 +  }
 +
 +  public ArrayList<OOVItem> oovList = null;
 +
 +  /*
 +   * Whether to segment OOVs into a lattice
 +   */
 +  public boolean segment_oovs = false;
 +
 +  /*
 +   * Enable lattice decoding.
 +   */
 +  public boolean lattice_decoding = false;
 +
 +  /*
 +   * If false, sorting of the complete grammar is done at load time. If true, grammar tries are not
 +   * sorted till they are first accessed. Amortized sorting means you get your first translation
 +   * much, much quicker (good for debugging), but that per-sentence decoding is a bit slower.
 +   */
 +  public boolean amortized_sorting = true;
 +  // syntax-constrained decoding
 +  public boolean constrain_parse = false;
 +
 +  public boolean use_pos_labels = false;
 +
 +  // oov-specific
 +  public boolean true_oovs_only = false;
 +
 +  /* Dynamic sentence-level filtering. */
 +  public boolean filter_grammar = false;
 +
 +  /* The cube pruning pop limit. Set to 0 for exhaustive pruning. */
 +  public int pop_limit = 100;
 +
 +  /* Maximum sentence length. Sentences longer than this are truncated. */
 +  public int maxlen = 200;
 +
 +  /*
 +   * N-best configuration.
 +   */
 +  // Make sure output strings in the n-best list are unique.
 +  public boolean use_unique_nbest = true;
 +
 +  /* Include the phrasal alignments in the output (not word-level alignmetns at the moment). */
 +  public boolean include_align_index = false;
 +
 +  /* The number of hypotheses to output by default. */
 +  public int topN = 1;
 +
 +  /**
 +   * This string describes the format of each line of output from the decoder (i.e., the
 +   * translations). The string can include arbitrary text and also variables. The following
 +   * variables are available:
 +   *
 +   * <pre>
 +   * - %i the 0-indexed sentence number
 +   * - %e the source string %s the translated sentence
 +   * - %S the translated sentence with some basic capitalization and denormalization
 +   * - %t the synchronous derivation
 +   * - %f the list of feature values (as name=value pairs)
 +   * - %c the model cost
 +   * - %w the weight vector
 +   * - %a the alignments between source and target words (currently unimplemented)
 +   * - %d a verbose, many-line version of the derivation
 +   * </pre>
 +   */
 +  public String outputFormat = "%i ||| %s ||| %f ||| %c";
 +
 +  /* The number of decoding threads to use (-threads). */
 +  public int num_parallel_decoders = 1;
 +
 +  // disk hg
 +  public String hypergraphFilePattern = "";
 +
 +  /*
 +   * When true, _OOV is appended to all words that are passed through (useful for something like
 +   * transliteration on the target side
 +   */
 +  public boolean mark_oovs = false;
 +
 +  /* Enables synchronous parsing. */
 +  public boolean parse = false; // perform synchronous parsing
 +
 +
 +  /* A list of the feature functions. */
 +  public ArrayList<String> features = new ArrayList<String>();
 +
 +  /* A list of weights found in the main config file (instead of in a separate weights file) */
 +  public ArrayList<String> weights = new ArrayList<String>();
 +
 +  /* Determines whether to expect JSON input or plain lines */
 +  public enum INPUT_TYPE { plain, json };
 +  public INPUT_TYPE input_type = INPUT_TYPE.plain;
 +
 +  /* Type of server. Not sure we need to keep the regular TCP one around. */
 +  public enum SERVER_TYPE { none, TCP, HTTP };
 +  public SERVER_TYPE server_type = SERVER_TYPE.TCP;
 +
 +  /* If set, Joshua will start a (multi-threaded, per "threads") TCP/IP server on this port. */
 +  public int server_port = 0;
 +
 +  /*
 +   * Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
 +   * the input sentences in the following format:
 +   * 
 +   * input sentence ||| ||| reference1 ||| reference2 ...
 +   * 
 +   * (The second field is reserved for the output sentence for alignment and forced decoding).
 +   */
 +
 +  public boolean rescoreForest = false;
 +  public float rescoreForestWeight = 10.0f;
 +
 +  /*
 +   * Location of fragment mapping file, which maps flattened SCFG rules to their internal
 +   * representation.
 +   */
 +  public String fragmentMapFile = null;
 +
 +  /*
 +   * Whether to use soft syntactic constraint decoding /fuzzy matching, which allows that any
 +   * nonterminal may be substituted for any other nonterminal (except for OOV and GOAL)
 +   */
 +  public boolean fuzzy_matching = false;
 +
 +  public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";
 +
 +  /***
 +   * Phrase-based decoding parameters.
 +   */
 +  
 +  /* The search algorithm: currently either "cky" or "stack" */
 +  public String search_algorithm = "cky";
 +
 +  /* The distortion limit */
 +  public int reordering_limit = 8;
 +
 +  /* The number of target sides considered for each source side (after sorting by model weight) */
 +  public int num_translation_options = 20;
 +
 +  /* If true, decode using a dot chart (standard CKY+); if false, use the much more efficient
 +   * version of Sennrich (SSST 2014)
 +   */
 +  public boolean use_dot_chart = true;
 +
 +  /* Moses compatibility */
 +  public boolean moses = false;
 +
 +  /* If true, just print out the weights found in the config file, and exit. */
 +  public boolean show_weights_and_quit = false;
 +
 +  /* Read input from a file (Moses compatible flag) */
 +  public String input_file = null;
 +
 +  /* Write n-best output to this file */
 +  public String n_best_file = null;
 +
 +  /* Whether to look at source side for special annotations */
 +  public boolean source_annotations = false;
 +
 +  /* Weights overridden from the command line */
 +  public String weight_overwrite = "";
 +
 +  /**
 +   * This method resets the state of JoshuaConfiguration back to the state after initialization.
 +   * This is useful when for example making different calls to the decoder within the same java
 +   * program, which otherwise leads to potential errors due to inconsistent state as a result of
 +   * loading the configuration multiple times without resetting etc.
 +   *
 +   * This leads to the insight that in fact it may be an even better idea to refactor the code and
 +   * make JoshuaConfiguration an object that is is created and passed as an argument, rather than a
 +   * shared static object. This is just a suggestion for the next step.
 +   *
 +   */
 +  public void reset() {
 +    LOG.info("Resetting the JoshuaConfiguration to its defaults ...");
 +    LOG.info("\n\tResetting the StatefullFF global state index ...");
 +    LOG.info("\n\t...done");
 +    StatefulFF.resetGlobalStateIndex();
 +    tms = new ArrayList<String>();
 +    weights_file = "";
 +    default_non_terminal = "[X]";
 +    oovList = new ArrayList<OOVItem>();
 +    oovList.add(new OOVItem(default_non_terminal, 1.0f));
 +    goal_symbol = "[GOAL]";
 +    amortized_sorting = true;
 +    constrain_parse = false;
 +    use_pos_labels = false;
 +    true_oovs_only = false;
 +    filter_grammar = false;
 +    pop_limit = 100;
 +    maxlen = 200;
 +    use_unique_nbest = false;
 +    include_align_index = false;
 +    topN = 1;
 +    outputFormat = "%i ||| %s ||| %f ||| %c";
 +    num_parallel_decoders = 1;
 +    hypergraphFilePattern = "";
 +    mark_oovs = false;
 +    // oracleFile = null;
 +    parse = false; // perform synchronous parsing
 +    features = new ArrayList<String>();
 +    weights = new ArrayList<String>();
 +    server_port = 0;
 +
 +    reordering_limit = 8;
 +    num_translation_options = 20;
 +    LOG.info("...done");
 +  }
 +
 +  // ===============================================================
 +  // Methods
 +  // ===============================================================
 +
 +  /**
 +   * To process command-line options, we write them to a file that looks like the config file, and
 +   * then call readConfigFile() on it. It would be more general to define a class that sits on a
 +   * stream and knows how to chop it up, but this was quicker to implement.
 +   * 
 +   * @param options string array of command line options
 +   */
 +  public void processCommandLineOptions(String[] options) {
 +    try {
 +      File tmpFile = File.createTempFile("options", null, null);
 +      PrintWriter out = new PrintWriter(new FileWriter(tmpFile));
 +
 +      for (int i = 0; i < options.length; i++) {
 +        String key = options[i].substring(1);
 +        if (i + 1 == options.length || options[i + 1].startsWith("-")) {
 +          // if this is the last item, or if the next item
 +          // is another flag, then this is a boolean flag
 +          out.println(key + " = true");
 +
 +        } else {
 +          out.print(key + " =");
 +          while (i + 1 < options.length && ! options[i + 1].startsWith("-")) {
 +            out.print(String.format(" %s", options[i + 1]));
 +            i++;
 +          }
 +          out.println();
 +        }
 +      }
 +      out.close();
 +      this.readConfigFile(tmpFile.getCanonicalPath());
 +
 +      tmpFile.delete();
 +
 +    } catch (IOException e) {
 +      throw new RuntimeException(e);
 +    }
 +  }
 +
 +  public void readConfigFile(String configFile) throws IOException {
 +
 +    LineReader configReader = new LineReader(configFile, false);
 +    try {
 +      for (String line : configReader) {
 +        line = line.trim(); // .toLowerCase();
 +
 +        if (Regex.commentOrEmptyLine.matches(line))
 +          continue;
 +
 +        /*
 +         * There are two kinds of substantive (non-comment, non-blank) lines: parameters and feature
 +         * values. Parameters match the pattern "key = value"; all other substantive lines are
 +         * interpreted as features.
 +         */
 +
 +        if (line.indexOf("=") != -1) { // parameters; (not feature function)
 +          String[] fds = Regex.equalsWithSpaces.split(line, 2);
 +          if (fds.length < 2) {
 +            LOG.warn("skipping config file line '{}'", line);
 +            continue;
 +          }
 +
 +          String parameter = normalize_key(fds[0]);
 +
 +          if (parameter.equals(normalize_key("lm"))) {
 +            /* This is deprecated. This support old LM lines of the form
 +             * 
 +             *   lm = berkeleylm 5 false false 100 lm.gz
 +             * 
 +             * LMs are now loaded as general feature functions, so we transform that to either
 +             * 
 +             *   feature-function = LanguageModel -lm_order 5 -lm_type berkeleylm -lm_file lm.gz
 +             * 
 +             * If the line were state minimizing:
 +             * 
 +             *   lm = kenlm 5 true false 100 lm.gz
 +             *              
 +             * feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.gz
 +             */
 +
 +            String[] tokens = fds[1].split("\\s+");
 +            if (tokens[2].equals("true"))
 +              features.add(String.format("feature_function = StateMinimizingLanguageModel -lm_type kenlm -lm_order %s -lm_file %s",
 +                  tokens[1], tokens[5]));
 +            else
 +              features.add(String.format("feature_function = LanguageModel -lm_type %s -lm_order %s -lm_file %s",
 +                  tokens[0], tokens[1], tokens[5]));
 +
 +          } else if (parameter.equals(normalize_key("tm"))) {
 +            /* If found, convert old format:
 +             *   tm = TYPE OWNER MAXSPAN PATH
 +             * to new format
 +             *   tm = TYPE -owner OWNER -maxspan MAXSPAN -path PATH    
 +             */
 +            String tmLine = fds[1];
 +
 +            String[] tokens = fds[1].split("\\s+");
 +            if (! tokens[1].startsWith("-")) { // old format
 +              tmLine = String.format("%s -owner %s -maxspan %s -path %s", tokens[0], tokens[1], tokens[2], tokens[3]);
 +              LOG.warn("Converting deprecated TM line from '{}' -> '{}'", fds[1], tmLine);
 +            }
 +            tms.add(tmLine);
 +
 +          } else if (parameter.equals("v")) {
 +            Decoder.VERBOSE = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("parse"))) {
 +            parse = Boolean.parseBoolean(fds[1]);
 +            LOG.debug("parse: {}", parse);
 +
 +          } else if (parameter.equals(normalize_key("dump-hypergraph"))) {
 +            hypergraphFilePattern = fds[1].trim();
 +            LOG.debug("  hypergraph dump file format: {}", hypergraphFilePattern);
 +
 +          } else if (parameter.equals(normalize_key("oov-list"))) {
 +            if (new File(fds[1]).exists()) {
 +              oovList = new ArrayList<OOVItem>();
 +              try {
 +                File file = new File(fds[1]);
 +                BufferedReader br = new BufferedReader(new FileReader(file));
 +                try {
 +                  String str = br.readLine();
 +                  while (str != null) {
 +                    String[] tokens = str.trim().split("\\s+");
 +
-                     oovList.add(new OOVItem(FormatUtils.markup(tokens[0]),
-                         (float) Math.log(Float.parseFloat(tokens[1]))));
++                    oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[0]),
++                            (float) Math.log(Float.parseFloat(tokens[1]))));
 +
 +                    str = br.readLine();
 +                  }
 +                  br.close();
 +                } catch(IOException e){
 +                  System.out.println(e);
 +                }
 +              } catch(IOException e){
 +                System.out.println(e);
 +              }
 +              Collections.sort(oovList);
 +
 +            } else {
 +              String[] tokens = fds[1].trim().split("\\s+");
 +              if (tokens.length % 2 != 0) {
 +                throw new RuntimeException(String.format("* FATAL: invalid format for '%s'", fds[0]));
 +              }
 +              oovList = new ArrayList<OOVItem>();
 +
 +              for (int i = 0; i < tokens.length; i += 2)
-                 oovList.add(new OOVItem(FormatUtils.markup(tokens[i]),
++                oovList.add(new OOVItem(FormatUtils.ensureNonTerminalBrackets(tokens[i]),
 +                    (float) Math.log(Float.parseFloat(tokens[i + 1]))));
 +
 +              Collections.sort(oovList);
 +            }
 +
 +          } else if (parameter.equals(normalize_key("lattice-decoding"))) {
 +            lattice_decoding = true;
 +
 +          } else if (parameter.equals(normalize_key("segment-oovs"))) {
 +            segment_oovs = true;
 +            lattice_decoding = true;
 +
 +          } else if (parameter.equals(normalize_key("default-non-terminal"))) {
-             default_non_terminal = markup(cleanNonTerminal(fds[1].trim()));
++            default_non_terminal = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
 +            LOG.debug("default_non_terminal: {}", default_non_terminal);
 +
 +          } else if (parameter.equals(normalize_key("goal-symbol"))) {
-             goal_symbol = markup(cleanNonTerminal(fds[1].trim()));
++            goal_symbol = ensureNonTerminalBrackets(cleanNonTerminal(fds[1].trim()));
 +            LOG.debug("goalSymbol: {}", goal_symbol);
 +
 +          } else if (parameter.equals(normalize_key("weights-file"))) {
 +            weights_file = fds[1];
 +
 +          } else if (parameter.equals(normalize_key("constrain_parse"))) {
 +            constrain_parse = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("true_oovs_only"))) {
 +            true_oovs_only = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("filter-grammar"))) {
 +            filter_grammar = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("amortize"))) {
 +            amortized_sorting = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("use_pos_labels"))) {
 +            use_pos_labels = Boolean.parseBoolean(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("use_unique_nbest"))) {
 +            use_unique_nbest = Boolean.valueOf(fds[1]);
 +            LOG.debug("use_unique_nbest: {}", use_unique_nbest);
 +
 +          } else if (parameter.equals(normalize_key("output-format"))) {
 +            outputFormat = fds[1];
 +            LOG.debug("output-format: {}", outputFormat);
 +
 +          } else if (parameter.equals(normalize_key("include_align_index"))) {
 +            include_align_index = Boolean.valueOf(fds[1]);
 +            LOG.debug("include_align_index: {}", include_align_index);
 +
 +          } else if (parameter.equals(normalize_key("top_n"))) {
 +            topN = Integer.parseInt(fds[1]);
 +            LOG.debug("topN: {}", topN);
 +
 +          } else if (parameter.equals(normalize_key("num_parallel_decoders"))
 +              || parameter.equals(normalize_key("threads"))) {
 +            num_parallel_decoders = Integer.parseInt(fds[1]);
 +            if (num_parallel_decoders <= 0) {
 +              throw new IllegalArgumentException(
 +                  "Must specify a positive number for num_parallel_decoders");
 +            }
 +            LOG.debug("num_parallel_decoders: {}", num_parallel_decoders);
 +
 +          } else if (parameter.equals(normalize_key("mark_oovs"))) {
 +            mark_oovs = Boolean.valueOf(fds[1]);
 +            LOG.debug("mark_oovs: {}", mark_oovs);
 +
 +          } else if (parameter.equals(normalize_key("pop-limit"))) {
 +            pop_limit = Integer.parseInt(fds[1]);
 +            LOG.info("pop-limit: {}", pop_limit);
 +
 +          } else if (parameter.equals(normalize_key("input-type"))) {
 +            if (fds[1].equals("json")) {
 +              input_type = INPUT_TYPE.json;
 +            } else if (fds[1].equals("plain")) {
 +              input_type = INPUT_TYPE.plain;
 +            } else {
 +              throw new RuntimeException(String.format("* FATAL: invalid server type '%s'", fds[1]));
 +            }
 +            LOG.info("    input-type: {}", input_type);
 +
 +          } else if (parameter.equals(normalize_key("server-type"))) {
 +            if (fds[1].toLowerCase().equals("tcp"))
 +              server_type = SERVER_TYPE.TCP;
 +            else if (fds[1].toLowerCase().equals("http"))
 +              server_type = SERVER_TYPE.HTTP;
 +
 +            LOG.info("    server-type: {}", server_type);
 +
 +          } else if (parameter.equals(normalize_key("server-port"))) {
 +            server_port = Integer.parseInt(fds[1]);
 +            LOG.info("    server-port: {}", server_port);
 +
 +          } else if (parameter.equals(normalize_key("rescore-forest"))) {
 +            rescoreForest = true;
 +            LOG.info("    rescore-forest: {}", rescoreForest);
 +
 +          } else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
 +            rescoreForestWeight = Float.parseFloat(fds[1]);
 +            LOG.info("    rescore-forest-weight: {}", rescoreForestWeight);
 +
 +          } else if (parameter.equals(normalize_key("maxlen"))) {
 +            // reset the maximum length
 +            maxlen = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals("c") || parameter.equals("config")) {
 +            // this was used to send in the config file, just ignore it
 +            ;
 +
 +          } else if (parameter.equals(normalize_key("feature-function"))) {
 +            // add the feature to the list of features for later processing
 +            features.add("feature_function = " + fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("maxlen"))) {
 +            // add the feature to the list of features for later processing
 +            maxlen = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter
 +              .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
 +            fuzzy_matching = Boolean.parseBoolean(fds[1]);
 +            LOG.debug("fuzzy_matching: {}", fuzzy_matching);
 +
 +          } else if (parameter.equals(normalize_key("fragment-map"))) {
 +            fragmentMapFile = fds[1];
 +            Tree.readMapping(fragmentMapFile);
 +
 +            /** PHRASE-BASED PARAMETERS **/
 +          } else if (parameter.equals(normalize_key("search"))) {
 +            search_algorithm = fds[1];
 +
 +            if (!search_algorithm.equals("cky") && !search_algorithm.equals("stack")) {
 +              throw new RuntimeException(
 +                  "-search must be one of 'stack' (for phrase-based decoding) " +
 +                      "or 'cky' (for hierarchical / syntactic decoding)");
 +            }
 +
 +            if (search_algorithm.equals("cky") && include_align_index) {
 +              throw new RuntimeException(
 +                  "include_align_index is currently not supported with cky search");
 +            }
 +
 +          } else if (parameter.equals(normalize_key("reordering-limit"))) {
 +            reordering_limit = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("num-translation-options"))) {
 +            num_translation_options = Integer.parseInt(fds[1]);
 +
 +          } else if (parameter.equals(normalize_key("no-dot-chart"))) {
 +            use_dot_chart = false;
 +
 +          } else if (parameter.equals(normalize_key("moses"))) {
 +            moses = true; // triggers some Moses-specific compatibility options
 +
 +          } else if (parameter.equals(normalize_key("show-weights"))) {
 +            show_weights_and_quit = true;
 +
 +          } else if (parameter.equals(normalize_key("n-best-list"))) {
 +            // for Moses compatibility
 +            String[] tokens = fds[1].split("\\s+");
 +            n_best_file = tokens[0];
 +            if (tokens.length > 1)
 +              topN = Integer.parseInt(tokens[1]);
 +
 +          } else if (parameter.equals(normalize_key("input-file"))) {
 +            // for Moses compatibility
 +            input_file = fds[1];
 +
 +          } else if (parameter.equals(normalize_key("weight-file"))) {
 +            // for Moses, ignore
 +
 +          } else if (parameter.equals(normalize_key("weight-overwrite"))) {
 +            weight_overwrite = fds[1];
 +
 +          } else if (parameter.equals(normalize_key("source-annotations"))) {
 +            // Check source sentence
 +            source_annotations = true;
 +
 +          } else if (parameter.equals(normalize_key("cached-rules-size"))) {
 +            // Check source sentence
 +            cachedRuleSize = Integer.parseInt(fds[1]);
 +          } else if (parameter.equals(normalize_key("lowercase"))) {
 +            lowercase = true;
 +
 +          } else if (parameter.equals(normalize_key("project-case"))) {
 +            project_case = true;
 +
 +          } else {
 +
 +            if (parameter.equals(normalize_key("use-sent-specific-tm"))
 +                || parameter.equals(normalize_key("add-combined-cost"))
 +                || parameter.equals(normalize_key("use-tree-nbest"))
 +                || parameter.equals(normalize_key("use-kenlm"))
 +                || parameter.equals(normalize_key("useCubePrune"))
 +                || parameter.equals(normalize_key("useBeamAndThresholdPrune"))
 +                || parameter.equals(normalize_key("regexp-grammar"))) {
 +              LOG.warn("ignoring deprecated parameter '{}'", fds[0]);
 +
 +            } else {
 +              throw new RuntimeException("FATAL: unknown configuration parameter '" + fds[0] + "'");
 +            }
 +          }
 +
 +          LOG.info("    {} = '{}'", normalize_key(fds[0]), fds[1]);
 +
 +        } else {
 +          /*
 +           * Lines that don't have an equals sign and are not blank lines, empty lines, or comments,
 +           * are feature values, which can be present in this file
 +           */
 +
 +          weights.add(line);
 +        }
 +      }
 +    } finally {
 +      configReader.close();
 +    }
 +  }
 +
 +  /**
 +   * Checks for invalid variable configurations
 +   */
 +  public void sanityCheck() {
 +  }
 +
 +  /**
 +   * Normalizes parameter names by removing underscores and hyphens and lowercasing. This defines
 +   * equivalence classes on external use of parameter names, permitting arbitrary_under_scores and
 +   * camelCasing in paramter names without forcing the user to memorize them all. Here are some
 +   * examples of equivalent ways to refer to parameter names:
 +   * <pre>
 +   * {pop-limit, poplimit, PopLimit, popLimit, pop_lim_it} {lmfile, lm-file, LM-FILE, lm_file}
 +   * </pre>
 +   * 
 +   * @param text the string to be normalized
 +   * @return normalized key
 +   * 
 +   */
 +  public static String normalize_key(String text) {
 +    return text.replaceAll("[-_]", "").toLowerCase();
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
index 184ae27,0000000..7dd3be9
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
+++ b/src/main/java/org/apache/joshua/decoder/chart_parser/Chart.java
@@@ -1,742 -1,0 +1,742 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.chart_parser;
 +
 +import java.util.ArrayList;
 +import java.util.Arrays;
 +import java.util.HashSet;
 +import java.util.List;
 +import java.util.PriorityQueue;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.chart_parser.DotChart.DotNode;
 +import org.apache.joshua.decoder.ff.FeatureFunction;
 +import org.apache.joshua.decoder.ff.SourceDependentFF;
 +import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
 +import org.apache.joshua.decoder.ff.tm.Grammar;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.ff.tm.RuleCollection;
 +import org.apache.joshua.decoder.ff.tm.Trie;
 +import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
 +import org.apache.joshua.decoder.hypergraph.HGNode;
 +import org.apache.joshua.decoder.hypergraph.HyperGraph;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +import org.apache.joshua.decoder.segment_file.Token;
 +import org.apache.joshua.lattice.Arc;
 +import org.apache.joshua.lattice.Lattice;
 +import org.apache.joshua.lattice.Node;
 +import org.apache.joshua.util.ChartSpan;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +/**
 + * Chart class this class implements chart-parsing: (1) seeding the chart (2)
 + * cky main loop over bins, (3) identify applicable rules in each bin
 + * 
 + * Note: the combination operation will be done in Cell
 + * 
 + * Signatures of class: Cell: i, j SuperNode (used for CKY check): i,j, lhs
 + * HGNode ("or" node): i,j, lhs, edge ngrams HyperEdge ("and" node)
 + * 
 + * index of sentences: start from zero index of cell: cell (i,j) represent span
 + * of words indexed [i,j-1] where i is in [0,n-1] and j is in [1,n]
 + * 
 + * @author Zhifei Li, zhifei.work@gmail.com
 + * @author Matt Post post@cs.jhu.edu
 + */
 +
 +public class Chart {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(Chart.class);
 +  private final JoshuaConfiguration config;
 +  // ===========================================================
 +  // Statistics
 +  // ===========================================================
 +
 +  /**
 +   * how many items have been pruned away because its cost is greater than the
 +   * cutoff in calling chart.add_deduction_in_chart()
 +   */
 +  int nMerged = 0;
 +  int nAdded = 0;
 +  int nDotitemAdded = 0; // note: there is no pruning in dot-item
 +
 +  public Sentence getSentence() {
 +    return this.sentence;
 +  }
 +  
 +  // ===============================================================
 +  // Private instance fields (maybe could be protected instead)
 +  // ===============================================================
 +  private ChartSpan<Cell> cells; // note that in some cell, it might be null
 +  private int sourceLength;
 +  private List<FeatureFunction> featureFunctions;
 +  private Grammar[] grammars;
 +  private DotChart[] dotcharts; // each grammar should have a dotchart associated with it
 +  private Cell goalBin;
 +  private int goalSymbolID = -1;
 +  private Lattice<Token> inputLattice;
 +
 +  private Sentence sentence = null;
 +//  private SyntaxTree parseTree;
 +//  private ManualConstraintsHandler manualConstraintsHandler;
 +  private StateConstraint stateConstraint;
 +
 +
 +  // ===============================================================
 +  // Constructors
 +  // ===============================================================
 +
 +  /*
 +   * TODO: Once the Segment interface is adjusted to provide a Lattice<String>
 +   * for the sentence() method, we should just accept a Segment instead of the
 +   * sentence, segmentID, and constraintSpans parameters. We have the symbol
 +   * table already, so we can do the integerization here instead of in
 +   * DecoderThread. GrammarFactory.getGrammarForSentence will want the
 +   * integerized sentence as well, but then we'll need to adjust that interface
 +   * to deal with (non-trivial) lattices too. Of course, we get passed the
 +   * grammars too so we could move all of that into here.
 +   */
 +
 +  public Chart(Sentence sentence, List<FeatureFunction> featureFunctions, Grammar[] grammars,
 +      String goalSymbol, JoshuaConfiguration config) {
 +    this.config = config;
 +    this.inputLattice = sentence.getLattice();
 +    this.sourceLength = inputLattice.size() - 1;
 +    this.featureFunctions = featureFunctions;
 +
 +    this.sentence = sentence;
 +
 +    // TODO: OOV handling no longer handles parse tree input (removed after
 +    // commit 748eb69714b26dd67cba8e7c25a294347603bede)
 +//    this.parseTree = null;
 +//    if (sentence instanceof ParsedSentence)
 +//      this.parseTree = ((ParsedSentence) sentence).syntaxTree();
 +//
 +    this.cells = new ChartSpan<Cell>(sourceLength, null);
 +
 +    this.goalSymbolID = Vocabulary.id(goalSymbol);
 +    this.goalBin = new Cell(this, this.goalSymbolID);
 +
 +    /* Create the grammars, leaving space for the OOV grammar. */
 +    this.grammars = new Grammar[grammars.length + 1];
 +    for (int i = 0; i < grammars.length; i++)
 +      this.grammars[i + 1] = grammars[i];
 +
 +    MemoryBasedBatchGrammar oovGrammar = new MemoryBasedBatchGrammar("oov", this.config);
 +    AbstractGrammar.addOOVRules(oovGrammar, sentence.getLattice(), featureFunctions,
 +        this.config.true_oovs_only);
 +    this.grammars[0] = oovGrammar;
 +
 +    // each grammar will have a dot chart
 +    this.dotcharts = new DotChart[this.grammars.length];
 +    for (int i = 0; i < this.grammars.length; i++)
 +      this.dotcharts[i] = new DotChart(this.inputLattice, this.grammars[i], this,
 +          this.grammars[i].isRegexpGrammar());
 +
 +    // Begin to do initialization work
 +
 +//    manualConstraintsHandler = new ManualConstraintsHandler(this, grammars[grammars.length - 1],
 +//        sentence.constraints());
 +
 +    stateConstraint = null;
 +    if (sentence.target() != null)
 +      // stateConstraint = new StateConstraint(sentence.target());
 +      stateConstraint = new StateConstraint(Vocabulary.START_SYM + " " + sentence.target() + " "
 +          + Vocabulary.STOP_SYM);
 +
 +    /* Find the SourceDependent feature and give it access to the sentence. */
 +    for (FeatureFunction ff : this.featureFunctions)
 +      if (ff instanceof SourceDependentFF)
 +        ((SourceDependentFF) ff).setSource(sentence);
 +
 +    LOG.debug("Finished seeding chart.");
 +  }
 +
 +  /**
 +   * Manually set the goal symbol ID. The constructor expects a String
 +   * representing the goal symbol, but there may be time (say, for example, in
 +   * the second pass of a synchronous parse) where we want to set the goal
 +   * symbol to a particular ID (regardless of String representation).
 +   * <p>
 +   * This method should be called before expanding the chart, as chart expansion
 +   * depends on the goal symbol ID.
 +   * 
 +   * @param i the id of the goal symbol to use
 +   */
 +  public void setGoalSymbolID(int i) {
 +    this.goalSymbolID = i;
 +    this.goalBin = new Cell(this, i);
 +    return;
 +  }
 +
 +  // ===============================================================
 +  // The primary method for filling in the chart
 +  // ===============================================================
 +
 +  /**
 +   * Construct the hypergraph with the help from DotChart using cube pruning.
 +   * Cube pruning occurs at the span level, with all completed rules from the
 +   * dot chart competing against each other; that is, rules with different
 +   * source sides *and* rules sharing a source side but with different target
 +   * sides are all in competition with each other.
 +   * 
 +   * Terminal rules are added to the chart directly.
 +   * 
 +   * Rules with nonterminals are added to the list of candidates. The candidates
 +   * list is seeded with the list of all rules and, for each nonterminal in the
 +   * rule, the 1-best tail node for that nonterminal and subspan. If the maximum
 +   * arity of a rule is R, then the dimension of the hypercube is R + 1, since
 +   * the first dimension is used to record the rule.
 +   */
 +  private void completeSpan(int i, int j) {
 +
 +    /* STEP 1: create the heap, and seed it with all of the candidate states */
 +    PriorityQueue<CubePruneState> candidates = new PriorityQueue<CubePruneState>();
 +
 +    /*
 +     * Look at all the grammars, seeding the chart with completed rules from the
 +     * DotChart
 +     */
 +    for (int g = 0; g < grammars.length; g++) {
 +      if (!grammars[g].hasRuleForSpan(i, j, inputLattice.distance(i, j))
 +          || null == dotcharts[g].getDotCell(i, j))
 +        continue;
 +
 +      // for each rule with applicable rules
 +      for (DotNode dotNode : dotcharts[g].getDotCell(i, j).getDotNodes()) {
 +        RuleCollection ruleCollection = dotNode.getRuleCollection();
 +        if (ruleCollection == null)
 +          continue;
 +
 +        List<Rule> rules = ruleCollection.getSortedRules(this.featureFunctions);
 +        SourcePath sourcePath = dotNode.getSourcePath();
 +
 +        if (null == rules || rules.size() == 0)
 +          continue;
 +
 +        if (ruleCollection.getArity() == 0) {
 +          /*
 +           * The total number of arity-0 items (pre-terminal rules) that we add
 +           * is controlled by num_translation_options in the configuration.
 +           * 
 +           * We limit the translation options per DotNode; that is, per LHS.
 +           */
 +          int numTranslationsAdded = 0;
 +
 +          /* Terminal productions are added directly to the chart */
 +          for (Rule rule : rules) {
 +
 +            if (config.num_translation_options > 0
 +                && numTranslationsAdded >= config.num_translation_options) {
 +              break;
 +            }
 +
 +            ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, rule, null, i,
 +                j, sourcePath, this.sentence);
 +
 +            if (stateConstraint == null || stateConstraint.isLegal(result.getDPStates())) {
 +              getCell(i, j).addHyperEdgeInCell(result, rule, i, j, null, sourcePath, true);
 +              numTranslationsAdded++;
 +            }
 +          }
 +        } else {
 +          /* Productions with rank > 0 are subject to cube pruning */
 +
 +          Rule bestRule = rules.get(0);
 +
 +          List<HGNode> currentTailNodes = new ArrayList<HGNode>();
 +          List<SuperNode> superNodes = dotNode.getAntSuperNodes();
 +          for (SuperNode si : superNodes) {
 +            currentTailNodes.add(si.nodes.get(0));
 +          }
 +
 +          /*
 +           * `ranks` records the current position in the cube. the 0th index is
 +           * the rule, and the remaining indices 1..N correspond to the tail
 +           * nodes (= nonterminals in the rule). These tail nodes are
 +           * represented by SuperNodes, which group together items with the same
 +           * nonterminal but different DP state (e.g., language model state)
 +           */
 +          int[] ranks = new int[1 + superNodes.size()];
 +          Arrays.fill(ranks, 1);
 +
 +          ComputeNodeResult result = new ComputeNodeResult(featureFunctions, bestRule,
 +              currentTailNodes, i, j, sourcePath, sentence);
 +          CubePruneState bestState = new CubePruneState(result, ranks, rules, currentTailNodes,
 +              dotNode);
 +          candidates.add(bestState);
 +        }
 +      }
 +    }
 +
 +    applyCubePruning(i, j, candidates);
 +  }
 +
 +  /**
 +   * Applies cube pruning over a span.
 +   * 
 +   * @param i
 +   * @param j
 +   * @param stateConstraint
 +   * @param candidates
 +   */
 +  private void applyCubePruning(int i, int j, PriorityQueue<CubePruneState> candidates) {
 +
 +    // System.err.println(String.format("CUBEPRUNE: %d-%d with %d candidates",
 +    // i, j, candidates.size()));
 +    // for (CubePruneState cand: candidates) {
 +    // System.err.println(String.format("  CAND " + cand));
 +    // }
 +
 +    /*
 +     * There are multiple ways to reach each point in the cube, so short-circuit
 +     * that.
 +     */
 +    HashSet<CubePruneState> visitedStates = new HashSet<CubePruneState>();
 +
 +    int popLimit = config.pop_limit;
 +    int popCount = 0;
 +    while (candidates.size() > 0 && ((++popCount <= popLimit) || popLimit == 0)) {
 +      CubePruneState state = candidates.poll();
 +
 +      DotNode dotNode = state.getDotNode();
 +      List<Rule> rules = state.rules;
 +      SourcePath sourcePath = dotNode.getSourcePath();
 +      List<SuperNode> superNodes = dotNode.getAntSuperNodes();
 +
 +      /*
 +       * Add the hypothesis to the chart. This can only happen if (a) we're not
 +       * doing constrained decoding or (b) we are and the state is legal.
 +       */
 +      if (stateConstraint == null || stateConstraint.isLegal(state.getDPStates())) {
 +        getCell(i, j).addHyperEdgeInCell(state.computeNodeResult, state.getRule(), i, j,
 +            state.antNodes, sourcePath, true);
 +      }
 +
 +      /*
 +       * Expand the hypothesis by walking down a step along each dimension of
 +       * the cube, in turn. k = 0 means we extend the rule being used; k > 0
 +       * expands the corresponding tail node.
 +       */
 +
 +      for (int k = 0; k < state.ranks.length; k++) {
 +
 +        /* Copy the current ranks, then extend the one we're looking at. */
 +        int[] nextRanks = new int[state.ranks.length];
 +        System.arraycopy(state.ranks, 0, nextRanks, 0, state.ranks.length);
 +        nextRanks[k]++;
 +
 +        /*
 +         * We might have reached the end of something (list of rules or tail
 +         * nodes)
 +         */
 +        if (k == 0
 +            && (nextRanks[k] > rules.size() || (config.num_translation_options > 0 && nextRanks[k] > config.num_translation_options)))
 +          continue;
 +        else if ((k != 0 && nextRanks[k] > superNodes.get(k - 1).nodes.size()))
 +          continue;
 +
 +        /* Use the updated ranks to assign the next rule and tail node. */
 +        Rule nextRule = rules.get(nextRanks[0] - 1);
 +        // HGNode[] nextAntNodes = new HGNode[state.antNodes.size()];
-         List<HGNode> nextAntNodes = new ArrayList<HGNode>();
++        List<HGNode> nextAntNodes = new ArrayList<HGNode>(state.antNodes.size());
 +        for (int x = 0; x < state.ranks.length - 1; x++)
 +          nextAntNodes.add(superNodes.get(x).nodes.get(nextRanks[x + 1] - 1));
 +
 +        /* Create the next state. */
 +        CubePruneState nextState = new CubePruneState(new ComputeNodeResult(featureFunctions,
 +            nextRule, nextAntNodes, i, j, sourcePath, this.sentence), nextRanks, rules,
 +            nextAntNodes, dotNode);
 +
 +        /* Skip states that have been explored before. */
 +        if (visitedStates.contains(nextState))
 +          continue;
 +
 +        visitedStates.add(nextState);
 +        candidates.add(nextState);
 +      }
 +    }
 +  }
 +
 +  /* Create a priority queue of candidates for each span under consideration */
 +  private PriorityQueue<CubePruneState>[] allCandidates;
 +
 +  private ArrayList<SuperNode> nodeStack;
 +
 +  /**
 +   * Translates the sentence using the CKY+ variation proposed in
 +   * "A CYK+ Variant for SCFG Decoding Without A Dot Chart" (Sennrich, SSST
 +   * 2014).
 +   */
 +  private int i = -1;
 +
 +  public HyperGraph expandSansDotChart() {
 +    for (i = sourceLength - 1; i >= 0; i--) {
 +      allCandidates = new PriorityQueue[sourceLength - i + 2];
 +      for (int id = 0; id < allCandidates.length; id++)
 +        allCandidates[id] = new PriorityQueue<CubePruneState>();
 +
 +      nodeStack = new ArrayList<SuperNode>();
 +
 +      for (int j = i + 1; j <= sourceLength; j++) {
 +        if (!sentence.hasPath(i, j))
 +          continue;
 +
 +        for (int g = 0; g < this.grammars.length; g++) {
 +          // System.err.println(String.format("\n*** I=%d J=%d GRAMMAR=%d", i, j, g));
 +
 +          if (j == i + 1) {
 +            /* Handle terminals */
 +            Node<Token> node = sentence.getNode(i);
 +            for (Arc<Token> arc : node.getOutgoingArcs()) {
 +              int word = arc.getLabel().getWord();
 +              // disallow lattice decoding for now
 +              assert arc.getHead().id() == j;
 +              Trie trie = this.grammars[g].getTrieRoot().match(word);
 +              if (trie != null && trie.hasRules())
 +                addToChart(trie, j, false);
 +            }
 +          } else {
 +            /* Recurse for non-terminal case */
 +            consume(this.grammars[g].getTrieRoot(), i, j - 1);
 +          }
 +        }
 +
 +        // Now that we've accumulated all the candidates, apply cube pruning
 +        applyCubePruning(i, j, allCandidates[j - i]);
 +
 +        // Add unary nodes
 +        addUnaryNodes(this.grammars, i, j);
 +      }
 +    }
 +
 +    // transition_final: setup a goal item, which may have many deductions
 +    if (null == this.cells.get(0, sourceLength)
 +        || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
 +            this.sourceLength)) {
 +      LOG.info("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
 +          sentence.id());
 +      return null;
 +    }
 +
 +    return new HyperGraph(this.goalBin.getSortedNodes().get(0), -1, -1, this.sentence);
 +  }
 +
 +  /**
 +   * Recursively consumes the trie, following input nodes, finding applicable
 +   * rules and adding them to bins for each span for later cube pruning.
 +   * 
 +   * @param dotNode data structure containing information about what's been
 +   *          already matched
 +   * @param l extension point we're looking at
 +   * 
 +   */
 +  private void consume(Trie trie, int j, int l) {
 +    /*
 +     * 1. If the trie node has any rules, we can add them to the collection
 +     * 
 +     * 2. Next, look at all the outgoing nonterminal arcs of the trie node. If
 +     * any of them match an existing chart item, then we know we can extend
 +     * (i,j) to (i,l). We then recurse for all m from l+1 to n (the end of the
 +     * sentence)
 +     * 
 +     * 3. We also try to match terminals if (j + 1 == l)
 +     */
 +
 +    // System.err.println(String.format("CONSUME %s / %d %d %d", dotNode,
 +    // dotNode.begin(), dotNode.end(), l));
 +
 +    // Try to match terminals
 +    if (inputLattice.distance(j, l) == 1) {
 +      // Get the current sentence node, and explore all outgoing arcs, since we
 +      // might be decoding
 +      // a lattice. For sentence decoding, this is trivial: there is only one
 +      // outgoing arc.
 +      Node<Token> inputNode = sentence.getNode(j);
 +      for (Arc<Token> arc : inputNode.getOutgoingArcs()) {
 +        int word = arc.getLabel().getWord();
 +        Trie nextTrie;
 +        if ((nextTrie = trie.match(word)) != null) {
 +          // add to chart item over (i, l)
 +          addToChart(nextTrie, arc.getHead().id(), i == j);
 +        }
 +      }
 +    }
 +
 +    // Now try to match nonterminals
 +    Cell cell = cells.get(j, l);
 +    if (cell != null) {
 +      for (int id : cell.getKeySet()) { // for each supernode (lhs), see if you
 +                                        // can match a trie
 +        Trie nextTrie = trie.match(id);
 +        if (nextTrie != null) {
 +          SuperNode superNode = cell.getSuperNode(id);
 +          nodeStack.add(superNode);
 +          addToChart(nextTrie, superNode.end(), i == j);
 +          nodeStack.remove(nodeStack.size() - 1);
 +        }
 +      }
 +    }
 +  }
 +
 +  /**
 +   * Adds all rules at a trie node to the chart, unless its a unary rule. A
 +   * unary rule is the first outgoing arc of a grammar's root trie. For
 +   * terminals, these are added during the seeding stage; for nonterminals,
 +   * these confuse cube pruning and can result in infinite loops, and are
 +   * handled separately (see addUnaryNodes());
 +   * 
 +   * @param trie the grammar node
 +   * @param isUnary whether the rules at this dotnode are unary
 +   */
 +  private void addToChart(Trie trie, int j, boolean isUnary) {
 +
 +    // System.err.println(String.format("ADD TO CHART %s unary=%s", dotNode,
 +    // isUnary));
 +
 +    if (!isUnary && trie.hasRules()) {
 +      DotNode dotNode = new DotNode(i, j, trie, new ArrayList<SuperNode>(nodeStack), null);
 +
 +      addToCandidates(dotNode);
 +    }
 +
 +    for (int l = j + 1; l <= sentence.length(); l++)
 +      consume(trie, j, l);
 +  }
 +
 +  /**
 +   * Record the completed rule with backpointers for later cube-pruning.
 +   * 
 +   * @param width
 +   * @param rules
 +   * @param tailNodes
 +   */
 +  private void addToCandidates(DotNode dotNode) {
 +    // System.err.println(String.format("ADD TO CANDIDATES %s AT INDEX %d",
 +    // dotNode, dotNode.end() - dotNode.begin()));
 +
 +    // TODO: one entry per rule, or per rule instantiation (rule together with
 +    // unique matching of input)?
 +    List<Rule> rules = dotNode.getRuleCollection().getSortedRules(featureFunctions);
 +    Rule bestRule = rules.get(0);
 +    List<SuperNode> superNodes = dotNode.getAntSuperNodes();
 +
 +    List<HGNode> tailNodes = new ArrayList<HGNode>();
 +    for (SuperNode superNode : superNodes)
 +      tailNodes.add(superNode.nodes.get(0));
 +
 +    int[] ranks = new int[1 + superNodes.size()];
 +    Arrays.fill(ranks, 1);
 +
 +    ComputeNodeResult result = new ComputeNodeResult(featureFunctions, bestRule, tailNodes,
 +        dotNode.begin(), dotNode.end(), dotNode.getSourcePath(), sentence);
 +    CubePruneState seedState = new CubePruneState(result, ranks, rules, tailNodes, dotNode);
 +
 +    allCandidates[dotNode.end() - dotNode.begin()].add(seedState);
 +  }
 +
 +  /**
 +   * This function performs the main work of decoding.
 +   * 
 +   * @return the hypergraph containing the translated sentence.
 +   */
 +  public HyperGraph expand() {
 +
 +    for (int width = 1; width <= sourceLength; width++) {
 +      for (int i = 0; i <= sourceLength - width; i++) {
 +        int j = i + width;
 +        LOG.debug("Processing span ({}, {})", i, j);
 +
 +        /* Skips spans for which no path exists (possible in lattices). */
 +        if (inputLattice.distance(i, j) == Float.POSITIVE_INFINITY) {
 +          continue;
 +        }
 +
 +        /*
 +         * 1. Expand the dot through all rules. This is a matter of (a) look for
 +         * rules over (i,j-1) that need the terminal at (j-1,j) and looking at
 +         * all split points k to expand nonterminals.
 +         */
 +        LOG.debug("Expanding cell");
 +        for (int k = 0; k < this.grammars.length; k++) {
 +          /**
 +           * Each dotChart can act individually (without consulting other
 +           * dotCharts) because it either consumes the source input or the
 +           * complete nonTerminals, which are both grammar-independent.
 +           **/
 +          this.dotcharts[k].expandDotCell(i, j);
 +        }
 +
 +        /*
 +         * 2. The regular CKY part: add completed items onto the chart via cube
 +         * pruning.
 +         */
 +        LOG.debug("Adding complete items into chart");
 +        completeSpan(i, j);
 +
 +        /* 3. Process unary rules. */
 +        LOG.debug("Adding unary items into chart");
 +        addUnaryNodes(this.grammars, i, j);
 +
 +        // (4)=== in dot_cell(i,j), add dot-nodes that start from the /complete/
 +        // superIterms in
 +        // chart_cell(i,j)
 +        LOG.debug("Initializing new dot-items that start from complete items in this cell");
 +        for (int k = 0; k < this.grammars.length; k++) {
 +          if (this.grammars[k].hasRuleForSpan(i, j, inputLattice.distance(i, j))) {
 +            this.dotcharts[k].startDotItems(i, j);
 +          }
 +        }
 +
 +        /*
 +         * 5. Sort the nodes in the cell.
 +         * 
 +         * Sort the nodes in this span, to make them usable for future
 +         * applications of cube pruning.
 +         */
 +        if (null != this.cells.get(i, j)) {
 +          this.cells.get(i, j).getSortedNodes();
 +        }
 +      }
 +    }
 +
 +    logStatistics();
 +
 +    // transition_final: setup a goal item, which may have many deductions
 +    if (null == this.cells.get(0, sourceLength)
 +        || !this.goalBin.transitToGoal(this.cells.get(0, sourceLength), this.featureFunctions,
 +            this.sourceLength)) {
 +      LOG.info("Input {}: Parse failure (either no derivations exist or pruning is too aggressive",
 +          sentence.id());
 +      return null;
 +    }
 +
 +    LOG.debug("Finished expand");
 +    return new HyperGraph(this.goalBin.getSortedNodes().get(0), -1, -1, this.sentence);
 +  }
 +
 +  /**
 +   * Get the requested cell, creating the entry if it doesn't already exist.
 +   * 
 +   * @param i span start
 +   * @param j span end
 +   * @return the cell item
 +   */
 +  public Cell getCell(int i, int j) {
 +    assert i >= 0;
 +    assert i <= sentence.length();
 +    assert i <= j;
 +    if (cells.get(i, j) == null)
 +      cells.set(i, j, new Cell(this, goalSymbolID));
 +
 +    return cells.get(i, j);
 +  }
 +
 +  // ===============================================================
 +  // Private methods
 +  // ===============================================================
 +
 +  private void logStatistics() {
 +    LOG.info("Input {}: Chart: added {} merged {} dot-items added: {}",
 +        this.sentence.id(), this.nAdded, this.nMerged, this.nDotitemAdded);
 +  }
 +
 +  /**
 +   * Handles expansion of unary rules. Rules are expanded in an agenda-based
 +   * manner to avoid constructing infinite unary chains. Assumes a triangle
 +   * inequality of unary rule expansion (e.g., A -> B will always be cheaper
 +   * than A -> C -> B), which is not a true assumption.
 +   * 
 +   * @param grammars A list of the grammars for the sentence
 +   * @param i
 +   * @param j
 +   * @return the number of nodes added
 +   */
 +  private int addUnaryNodes(Grammar[] grammars, int i, int j) {
 +
 +    Cell chartBin = this.cells.get(i, j);
 +    if (null == chartBin) {
 +      return 0;
 +    }
 +    int qtyAdditionsToQueue = 0;
 +    ArrayList<HGNode> queue = new ArrayList<HGNode>(chartBin.getSortedNodes());
 +    HashSet<Integer> seen_lhs = new HashSet<Integer>();
 +
 +    LOG.debug("Adding unary to [{}, {}]", i, j);
 +
 +    while (queue.size() > 0) {
 +      HGNode node = queue.remove(0);
 +      seen_lhs.add(node.lhs);
 +
 +      for (Grammar gr : grammars) {
 +        if (!gr.hasRuleForSpan(i, j, inputLattice.distance(i, j)))
 +          continue;
 +
 +        /*
 +         * Match against the node's LHS, and then make sure the rule collection
 +         * has unary rules
 +         */
 +        Trie childNode = gr.getTrieRoot().match(node.lhs);
 +        if (childNode != null && childNode.getRuleCollection() != null
 +            && childNode.getRuleCollection().getArity() == 1) {
 +
 +          ArrayList<HGNode> antecedents = new ArrayList<HGNode>();
 +          antecedents.add(node);
 +
 +          List<Rule> rules = childNode.getRuleCollection().getSortedRules(this.featureFunctions);
 +          for (Rule rule : rules) { // for each unary rules
 +
 +            ComputeNodeResult states = new ComputeNodeResult(this.featureFunctions, rule,
 +                antecedents, i, j, new SourcePath(), this.sentence);
 +            HGNode resNode = chartBin.addHyperEdgeInCell(states, rule, i, j, antecedents,
 +                new SourcePath(), true);
 +
 +            LOG.debug("{}", rule);
 +            if (null != resNode && !seen_lhs.contains(resNode.lhs)) {
 +              queue.add(resNode);
 +              qtyAdditionsToQueue++;
 +            }
 +          }
 +        }
 +      }
 +    }
 +    return qtyAdditionsToQueue;
 +  }
 +
 +  /***
 +   * Add a terminal production (X -&gt; english phrase) to the hypergraph.
 +   * 
 +   * @param i the start index
 +   * @param j stop index
 +   * @param rule the terminal rule applied
 +   * @param srcPath the source path cost
 +   */
 +  public void addAxiom(int i, int j, Rule rule, SourcePath srcPath) {
 +    if (null == this.cells.get(i, j)) {
 +      this.cells.set(i, j, new Cell(this, this.goalSymbolID));
 +    }
 +
 +    this.cells.get(i, j).addHyperEdgeInCell(
 +        new ComputeNodeResult(this.featureFunctions, rule, null, i, j, srcPath, sentence), rule, i,
 +        j, null, srcPath, false);
 +
 +  }
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/abb8c518/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
index 7c2fe5c,0000000..d57a6a2
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
+++ b/src/main/java/org/apache/joshua/decoder/chart_parser/CubePruneState.java
@@@ -1,116 -1,0 +1,114 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.chart_parser;
 +
- import java.util.ArrayList;
 +import java.util.Arrays;
 +import java.util.List;
 +
 +import org.apache.joshua.decoder.hypergraph.HGNode;
 +import org.apache.joshua.decoder.chart_parser.DotChart.DotNode;
 +import org.apache.joshua.decoder.ff.state_maintenance.DPState;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +
 +// ===============================================================
 +// CubePruneState class
 +// ===============================================================
 +public class CubePruneState implements Comparable<CubePruneState> {
 +  int[] ranks;
 +  ComputeNodeResult computeNodeResult;
 +  List<HGNode> antNodes;
 +  List<Rule> rules;
 +  private DotNode dotNode;
 +
 +  public CubePruneState(ComputeNodeResult score, int[] ranks, List<Rule> rules, List<HGNode> antecedents, DotNode dotNode) {
 +    this.computeNodeResult = score;
 +    this.ranks = ranks;
 +    this.rules = rules;
-     // create a new vector is critical, because currentAntecedents will change later
-     this.antNodes = new ArrayList<HGNode>(antecedents);
++    this.antNodes = antecedents;
 +    this.dotNode = dotNode;
 +  }
 +
 +  /**
 +   * This returns the list of DP states associated with the result.
 +   * 
 +   * @return
 +   */
 +  List<DPState> getDPStates() {
 +    return this.computeNodeResult.getDPStates();
 +  }
 +  
 +  Rule getRule() {
 +    return this.rules.get(this.ranks[0]-1);
 +  }
 +
 +  public String toString() {
 +    StringBuilder sb = new StringBuilder();
 +    sb.append("STATE ||| rule=" + getRule() + " inside cost = " + computeNodeResult.getViterbiCost()
 +        + " estimate = " + computeNodeResult.getPruningEstimate());
 +    return sb.toString();
 +  }
 +
 +  public void setDotNode(DotNode node) {
 +    this.dotNode = node;
 +  }
 +
 +  public DotNode getDotNode() {
 +    return this.dotNode;
 +  }
 +
 +  public boolean equals(Object obj) {
 +    if (obj == null)
 +      return false;
 +    if (!this.getClass().equals(obj.getClass()))
 +      return false;
 +    CubePruneState state = (CubePruneState) obj;
 +    if (state.ranks.length != ranks.length)
 +      return false;
 +    for (int i = 0; i < ranks.length; i++)
 +      if (state.ranks[i] != ranks[i])
 +        return false;
 +    if (getDotNode() != state.getDotNode())
 +      return false;
 +
 +    return true;
 +  }
 +
 +  public int hashCode() {
 +    int hash = (dotNode != null) ? dotNode.hashCode() : 0;
 +    hash += Arrays.hashCode(ranks);
 +
 +    return hash;
 +  }
 +
 +  /**
 +   * Compares states by ExpectedTotalLogP, allowing states to be sorted according to their inverse
 +   * order (high-prob first).
 +   */
 +  public int compareTo(CubePruneState another) {
 +    if (this.computeNodeResult.getPruningEstimate() < another.computeNodeResult
 +        .getPruningEstimate()) {
 +      return 1;
 +    } else if (this.computeNodeResult.getPruningEstimate() == another.computeNodeResult
 +        .getPruningEstimate()) {
 +      return 0;
 +    } else {
 +      return -1;
 +    }
 +  }
 +}