Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/31 16:37:07 UTC

[11/13] incubator-joshua git commit: Merge remote-tracking branch 'origin/master' into JOSHUA-252

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/tools/GrammarPacker.java
index 3b38c29,0000000..93aec2f
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/tools/GrammarPacker.java
+++ b/src/main/java/org/apache/joshua/tools/GrammarPacker.java
@@@ -1,958 -1,0 +1,959 @@@
 +/**
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.tools;
 +
 +import static org.apache.joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
 +
 +import java.io.BufferedOutputStream;
 +import java.io.DataOutputStream;
 +import java.io.File;
 +import java.io.FileOutputStream;
 +import java.io.FileWriter;
 +import java.io.IOException;
 +import java.io.PrintWriter;
 +import java.nio.ByteBuffer;
 +import java.util.ArrayList;
 +import java.util.LinkedList;
 +import java.util.List;
 +import java.util.Queue;
 +import java.util.TreeMap;
 +
 +import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.ff.tm.Rule;
 +import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
 +import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
 +import org.apache.joshua.util.FormatUtils;
 +import org.apache.joshua.util.encoding.EncoderConfiguration;
 +import org.apache.joshua.util.encoding.FeatureTypeAnalyzer;
 +import org.apache.joshua.util.encoding.IntEncoder;
 +import org.apache.joshua.util.io.LineReader;
 +import org.slf4j.Logger;
 +import org.slf4j.LoggerFactory;
 +
 +
 +public class GrammarPacker {
 +
 +  private static final Logger LOG = LoggerFactory.getLogger(GrammarPacker.class);
 +
 +  /**
 +   * The packed grammar version number. Increment this any time you add new features, and update
 +   * the documentation.
 +   * 
 +   * Version history:
 +   * 
 +   * - 3 (May 2016). This was the first version that was marked. It removed the special phrase-
 +   * table packing that packed phrases without the [X,1] on the source and target sides, which
 +   * then required special handling in the decoder for phrase-based decoding.
 +   */
 +  public static final int VERSION = 3;
 +  
 +  // Size limit for slice in bytes.
 +  private static final int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
 +  // Estimated average number of feature entries for one rule.
 +  private static final int DATA_SIZE_ESTIMATE = 20;
 +
 +  private static final String SOURCE_WORDS_SEPARATOR = " ||| ";
 +
 +  // Output directory name.
 +  private String output;
 +
 +  // Input grammar to be packed.
 +  private String grammar;
 +
 +  public String getGrammar() {
 +    return grammar;
 +  }
 +
 +  public String getOutputDirectory() {
 +    return output;
 +  }
 +
 +  // Approximate maximum size of a slice in number of rules
 +  private int approximateMaximumSliceSize;
 +
 +  private boolean labeled;
 +
 +  private boolean packAlignments;
 +  private boolean grammarAlignments;
 +  private String alignments;
 +
 +  private FeatureTypeAnalyzer types;
 +  private EncoderConfiguration encoderConfig;
 +
 +  private String dump;
 +
 +  private int max_source_len;
 +
 +  public GrammarPacker(String grammar_filename, String config_filename, String output_filename,
 +      String alignments_filename, String featuredump_filename, boolean grammar_alignments,
 +      int approximateMaximumSliceSize)
 +      throws IOException {
 +    this.labeled = true;
 +    this.grammar = grammar_filename;
 +    this.output = output_filename;
 +    this.dump = featuredump_filename;
 +    this.grammarAlignments = grammar_alignments;
 +    this.approximateMaximumSliceSize = approximateMaximumSliceSize;
 +    this.max_source_len = 0;
 +
 +    // TODO: Always open encoder config? This is debatable.
 +    this.types = new FeatureTypeAnalyzer(true);
 +
 +    this.alignments = alignments_filename;
 +    packAlignments = grammarAlignments || (alignments != null);
 +    if (!packAlignments) {
 +      LOG.info("No alignments file or grammar specified, skipping.");
 +    } else if (alignments != null && !new File(alignments_filename).exists()) {
 +      throw new RuntimeException("Alignments file does not exist: " + alignments);
 +    }
 +
 +    if (config_filename != null) {
 +      readConfig(config_filename);
 +      types.readConfig(config_filename);
 +    } else {
 +      LOG.info("No config specified. Attempting auto-detection of feature types.");
 +    }
 +    LOG.info("Approximate maximum slice size (in # of rules) set to {}", approximateMaximumSliceSize);
 +
 +    File working_dir = new File(output);
 +    working_dir.mkdir();
 +    if (!working_dir.exists()) {
 +      throw new RuntimeException("Failed creating output directory.");
 +    }
 +  }
 +
 +  private void readConfig(String config_filename) throws IOException {
 +    LineReader reader = new LineReader(config_filename);
 +    while (reader.hasNext()) {
 +      // Clean up line, chop comments off and skip if the result is empty.
 +      String line = reader.next().trim();
 +      if (line.indexOf('#') != -1)
 +        line = line.substring(0, line.indexOf('#'));
 +      if (line.isEmpty())
 +        continue;
 +      String[] fields = line.split("[\\s]+");
 +
 +      if (fields.length < 2) {
 +        throw new RuntimeException("Incomplete line in config.");
 +      }
 +      if ("slice_size".equals(fields[0])) {
 +        // Number of records to concurrently load into memory for sorting.
 +        approximateMaximumSliceSize = Integer.parseInt(fields[1]);
 +      }
 +    }
 +    reader.close();
 +  }
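 +
 +  /* For reference, the packer config is plain text with '#' comments; the only key this
 +   * method itself consumes is slice_size (the value below is hypothetical):
 +   *
 +   *   # pack at most 500,000 rules per slice
 +   *   slice_size 500000
 +   */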
 +
 +  /**
 +   * Executes the packing.
 +   * 
 +   * @throws IOException if there is an error reading the grammar
 +   */
 +  public void pack() throws IOException {
 +    LOG.info("Beginning exploration pass.");
 +
 +    // Explore pass. Learn vocabulary and feature value histograms.
 +    LOG.info("Exploring: {}", grammar);
 +
 +    HieroFormatReader grammarReader = getGrammarReader();
 +    explore(grammarReader);
 +
 +    LOG.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
 +    if (dump != null) {
 +      PrintWriter dump_writer = new PrintWriter(dump);
 +      dump_writer.println(types.toString());
 +      dump_writer.close();
 +    }
 +
 +    types.inferTypes(this.labeled);
 +    LOG.info("Type inference complete.");
 +
 +    LOG.info("Finalizing encoding.");
 +
 +    LOG.info("Writing encoding.");
 +    types.write(output + File.separator + "encoding");
 +
 +    writeVocabulary();
 +
 +    String configFile = output + File.separator + "config";
 +    LOG.info("Writing config to '{}'", configFile);
 +    // Write config options
 +    FileWriter config = new FileWriter(configFile);
 +    config.write(String.format("version = %d\n", VERSION));
 +    config.write(String.format("max-source-len = %d\n", max_source_len));
 +    config.close();
 +
 +    // Read the previously written encoder configuration to match up with the changed
 +    // vocabulary ids.
 +    LOG.info("Reading encoding.");
 +    encoderConfig = new EncoderConfiguration();
 +    encoderConfig.load(output + File.separator + "encoding");
 +
 +    LOG.info("Beginning packing pass.");
 +    // Actual binarization pass. Slice and pack source, target and data.
 +    grammarReader = getGrammarReader();
 +    LineReader alignment_reader = null;
 +    if (packAlignments && !grammarAlignments)
 +      alignment_reader = new LineReader(alignments);
 +    binarize(grammarReader, alignment_reader);
 +    LOG.info("Packing complete.");
 +
 +    LOG.info("Packed grammar in: {}", output);
 +    LOG.info("Done.");
 +  }
 +
 +  /**
 +   * Returns a reader that turns whatever file format is found into Hiero grammar rules.
 +   * 
 +   * @return a {@link HieroFormatReader}, or a {@link MosesFormatReader} for Moses-style
 +   *         phrase tables
 +   * @throws IOException if the grammar file cannot be read
 +   */
 +  private HieroFormatReader getGrammarReader() throws IOException {
 +    LineReader reader = new LineReader(grammar);
 +    String line = reader.next();
 +    // Close the probe; the format readers below reopen the file themselves.
 +    reader.close();
 +    if (line.startsWith("[")) {
 +      return new HieroFormatReader(grammar);
 +    } else {
 +      return new MosesFormatReader(grammar);
 +    }
 +  }
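 +
 +  /* A sketch of the two formats (example rules are illustrative, not from this commit):
 +   *
 +   *   Hiero lines begin with the left-hand side:
 +   *     [X] ||| [X,1] gato ||| [X,1] cat ||| 0.455 0.255
 +   *
 +   *   Moses phrase-table lines carry no LHS:
 +   *     el gato ||| the cat ||| 0.455 0.255
 +   *
 +   * which is why peeking at the first character of the first line suffices.
 +   */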
 +
 +  /**
 +   * This first pass over the grammar collects the vocabulary and observes all feature values,
 +   * so that feature types and encoders can be inferred before packing.
 +   * 
 +   * @param reader the reader over the input grammar
 +   */
 +  private void explore(HieroFormatReader reader) {
 +
 +    // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
 +    // appear in the same order. They are assigned numeric names in order of appearance.
 +    this.types.setLabeled(true);
 +
 +    for (Rule rule: reader) {
 +
 +      max_source_len = Math.max(max_source_len, rule.getFrench().length);
 +
 +      /* Add symbols to vocabulary.
 +       * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
 +       * and "[X,1]" to the vocabulary.
 +       * 
-        * TODO: MJP May 2016: Is it necessary to add [X,1]?
++       * TODO: MJP May 2016: Is it necessary to add [X,1]? This is currently being done in
++       * {@link HieroFormatReader}, which is called by {@link MosesFormatReader}. 
 +       */
 +
 +      // Add feature names to vocabulary and pass the value through the
 +      // appropriate encoder.
 +      int feature_counter = 0;
 +      String[] features = rule.getFeatureString().split("\\s+");
 +      for (int f = 0; f < features.length; ++f) {
 +        if (features[f].contains("=")) {
 +          String[] fe = features[f].split("=");
 +          if (fe[0].equals("Alignment"))
 +            continue;
 +          types.observe(Vocabulary.id(fe[0]), Float.parseFloat(fe[1]));
 +        } else {
 +          types.observe(Vocabulary.id(String.valueOf(feature_counter++)),
 +              Float.parseFloat(features[f]));
 +        }
 +      }
 +    }
 +  }
 +
 +  /**
 +   * Returns a String encoding the first two source words.
 +   * If there is only one source word, use empty string for the second.
 +   * For example, ["el", "gato"] yields "el ||| gato" and ["el"] yields "el ||| ".
 +   */
 +  private String getFirstTwoSourceWords(final String[] source_words) {
 +    return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
 +  }
 +
 +  private void binarize(HieroFormatReader grammarReader, LineReader alignment_reader) throws IOException {
 +    int counter = 0;
 +    int slice_counter = 0;
 +    int num_slices = 0;
 +
 +    boolean ready_to_flush = false;
 +    // to determine when flushing is possible
 +    String prev_first_two_source_words = null;
 +
 +    PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
 +    PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
 +    FeatureBuffer feature_buffer = new FeatureBuffer();
 +
 +    AlignmentBuffer alignment_buffer = null;
 +    if (packAlignments)
 +      alignment_buffer = new AlignmentBuffer();
 +
 +    TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
 +    for (Rule rule: grammarReader) {
 +      counter++;
 +      slice_counter++;
 +
 +      String lhs_word = Vocabulary.word(rule.getLHS());
 +      String[] source_words = rule.getFrenchWords().split("\\s+");
 +      String[] target_words = rule.getEnglishWords().split("\\s+");
 +      String[] feature_entries = rule.getFeatureString().split("\\s+");
 +
 +      // Reached slice limit size, indicate that we're closing up.
 +      if (!ready_to_flush
 +          && (slice_counter > approximateMaximumSliceSize
 +              || feature_buffer.overflowing()
 +              || (packAlignments && alignment_buffer.overflowing()))) {
 +        ready_to_flush = true;
 +        // store the first two source words when slice size limit was reached
 +        prev_first_two_source_words = getFirstTwoSourceWords(source_words);
 +      }
 +      // ready to flush
 +      if (ready_to_flush) {
 +        final String first_two_source_words = getFirstTwoSourceWords(source_words);
 +        // the grammar can only be partitioned at the level of first two source word changes.
 +        // Thus, we can only flush if the current first two source words differ from the ones
 +        // when the slice size limit was reached.
 +        if (!first_two_source_words.equals(prev_first_two_source_words)) {
 +          LOG.warn("ready to flush and first two words have changed ({} vs. {})",
 +              prev_first_two_source_words, first_two_source_words);
 +          LOG.info("flushing {} rules to slice.", slice_counter);
 +          flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
 +          source_trie.clear();
 +          target_trie.clear();
 +          feature_buffer.clear();
 +          if (packAlignments)
 +            alignment_buffer.clear();
 +
 +          num_slices++;
 +          slice_counter = 0;
 +          ready_to_flush = false;
 +        }
 +      }
 +
 +      int alignment_index = -1;
 +      // If present, process alignments.
 +      if (packAlignments) {
 +        String alignment_line;
 +        if (grammarAlignments) {
 +          alignment_line = rule.getAlignmentString();
 +        } else {
 +          if (!alignment_reader.hasNext()) {
 +            LOG.error("No more alignments starting in line {}", counter);
 +            throw new RuntimeException("No more alignments starting in line " + counter);
 +          }
 +          alignment_line = alignment_reader.next().trim();
 +        }
 +        String[] alignment_entries = alignment_line.split("\\s");
 +        byte[] alignments = new byte[alignment_entries.length * 2];
 +        if (alignment_entries.length != 0) {
 +          for (int i = 0; i < alignment_entries.length; i++) {
 +            String[] parts = alignment_entries[i].split("-");
 +            alignments[2 * i] = Byte.parseByte(parts[0]);
 +            alignments[2 * i + 1] = Byte.parseByte(parts[1]);
 +          }
 +        }
 +        alignment_index = alignment_buffer.add(alignments);
 +      }
 +
 +      // Process features.
 +      // Implicitly sort via TreeMap, write to data buffer, remember position
 +      // to pass on to the source trie node.
 +      features.clear();
 +      int feature_count = 0;
 +      for (int f = 0; f < feature_entries.length; ++f) {
 +        String feature_entry = feature_entries[f];
 +        int feature_id;
 +        float feature_value;
 +        if (feature_entry.contains("=")) {
 +          String[] parts = feature_entry.split("=");
 +          if (parts[0].equals("Alignment"))
 +            continue;
 +          feature_id = Vocabulary.id(parts[0]);
 +          feature_value = Float.parseFloat(parts[1]);
 +        } else {
 +          feature_id = Vocabulary.id(String.valueOf(feature_count++));
 +          feature_value = Float.parseFloat(feature_entry);
 +        }
 +        if (feature_value != 0)
 +          features.put(encoderConfig.innerId(feature_id), feature_value);
 +      }
 +      int features_index = feature_buffer.add(features);
 +
 +      // Sanity check on the data block index.
 +      if (packAlignments && features_index != alignment_index) {
 +        LOG.error("Block index mismatch between features ({}) and alignments ({}).",
 +            features_index, alignment_index);
 +        throw new RuntimeException("Data block index mismatch.");
 +      }
 +
 +      // Process source side.
 +      SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
 +      int[] source = new int[source_words.length];
 +      for (int i = 0; i < source_words.length; i++) {
 +        if (FormatUtils.isNonterminal(source_words[i]))
 +          source[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_words[i]));
 +        else
 +          source[i] = Vocabulary.id(source_words[i]);
 +      }
 +      source_trie.add(source, sv);
 +
 +      // Process target side.
 +      TargetValue tv = new TargetValue(sv);
 +      int[] target = new int[target_words.length];
 +      for (int i = 0; i < target_words.length; i++) {
 +        if (FormatUtils.isNonterminal(target_words[i])) {
 +          target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
 +        } else {
 +          target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
 +        }
 +      }
 +      target_trie.add(target, tv);
 +    }
 +    // flush last slice and clear buffers
 +    flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
 +  }
 +
 +  /**
 +   * Serializes the source, target and feature data structures into interlinked binary files. Target
 +   * is written first, into a skeletal (nodes don't carry any data) upward-pointing trie, updating
 +   * the linking source trie nodes with the position once it is known. Source and feature data are
 +   * written simultaneously. The source structure is written into a downward-pointing trie and
 +   * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
 +   * prompted to write out a block whenever a source trie node's rules are serialized.
 +   * 
 +   * @param source_trie the downward-pointing source-side trie
 +   * @param target_trie the upward-pointing target-side trie
 +   * @param feature_buffer the buffered feature data
 +   * @param alignment_buffer the buffered alignment data (null if alignments are not packed)
 +   * @param id the index of this slice
 +   * @throws IOException if writing any of the slice files fails
 +   */
 +  private void flush(PackingTrie<SourceValue> source_trie,
 +      PackingTrie<TargetValue> target_trie, FeatureBuffer feature_buffer,
 +      AlignmentBuffer alignment_buffer, int id) throws IOException {
 +    // Make a slice object for this piece of the grammar.
 +    PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
 +    // Pull out the streams for source, target and data output.
 +    DataOutputStream source_stream = slice.getSourceOutput();
 +    DataOutputStream target_stream = slice.getTargetOutput();
 +    DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
 +    DataOutputStream feature_stream = slice.getFeatureOutput();
 +    DataOutputStream alignment_stream = slice.getAlignmentOutput();
 +
 +    Queue<PackingTrie<TargetValue>> target_queue;
 +    Queue<PackingTrie<SourceValue>> source_queue;
 +
 +    // The number of bytes both written into the source stream and
 +    // buffered in the source queue.
 +    int source_position;
 +    // The number of bytes written into the target stream.
 +    int target_position;
 +
 +    // Add trie root into queue, set target position to 0 and set cumulated
 +    // size to size of trie root.
 +    target_queue = new LinkedList<PackingTrie<TargetValue>>();
 +    target_queue.add(target_trie);
 +    target_position = 0;
 +
 +    // Target lookup table for trie levels.
 +    int current_level_size = 1;
 +    int next_level_size = 0;
 +    ArrayList<Integer> target_lookup = new ArrayList<Integer>();
 +
 +    // Packing loop for upwards-pointing target trie.
 +    while (!target_queue.isEmpty()) {
 +      // Pop top of queue.
 +      PackingTrie<TargetValue> node = target_queue.poll();
 +      // Register that this is where we're writing the node to.
 +      node.address = target_position;
 +      // Tell source nodes that we're writing to this position in the file.
 +      for (TargetValue tv : node.values)
 +        tv.parent.target = node.address;
 +      // Write link to parent.
 +      if (node.parent != null)
 +        target_stream.writeInt(node.parent.address);
 +      else
 +        target_stream.writeInt(-1);
 +      target_stream.writeInt(node.symbol);
 +      // Enqueue children.
 +      for (int k : node.children.descendingKeySet()) {
 +        PackingTrie<TargetValue> child = node.children.get(k);
 +        target_queue.add(child);
 +      }
 +      target_position += node.size(false, true);
 +      next_level_size += node.children.descendingKeySet().size();
 +
 +      current_level_size--;
 +      if (current_level_size == 0) {
 +        target_lookup.add(target_position);
 +        current_level_size = next_level_size;
 +        next_level_size = 0;
 +      }
 +    }
 +    target_lookup_stream.writeInt(target_lookup.size());
 +    for (int i : target_lookup)
 +      target_lookup_stream.writeInt(i);
 +    target_lookup_stream.close();
 +
 +    // Setting up for source and data writing.
 +    source_queue = new LinkedList<PackingTrie<SourceValue>>();
 +    source_queue.add(source_trie);
 +    source_position = source_trie.size(true, false);
 +    source_trie.address = target_position;
 +
 +    // Ready data buffers for writing.
 +    feature_buffer.initialize();
 +    if (packAlignments)
 +      alignment_buffer.initialize();
 +
 +    // Packing loop for downwards-pointing source trie.
 +    while (!source_queue.isEmpty()) {
 +      // Pop top of queue.
 +      PackingTrie<SourceValue> node = source_queue.poll();
 +      // Write number of children.
 +      source_stream.writeInt(node.children.size());
 +      // Write links to children.
 +      for (int k : node.children.descendingKeySet()) {
 +        PackingTrie<SourceValue> child = node.children.get(k);
 +        // Enqueue child.
 +        source_queue.add(child);
 +        // Child's address will be at the current end of the queue.
 +        child.address = source_position;
 +        // Advance cumulated size by child's size.
 +        source_position += child.size(true, false);
 +        // Write the link.
 +        source_stream.writeInt(k);
 +        source_stream.writeInt(child.address);
 +      }
 +      // Write number of data items.
 +      source_stream.writeInt(node.values.size());
 +      // Write lhs and links to target and data.
 +      for (SourceValue sv : node.values) {
 +        int feature_block_index = feature_buffer.write(sv.data);
 +        if (packAlignments) {
 +          int alignment_block_index = alignment_buffer.write(sv.data);
 +          if (alignment_block_index != feature_block_index) {
 +            LOG.error("Block index mismatch.");
 +            throw new RuntimeException("Block index mismatch: alignment (" + alignment_block_index
 +                + ") and features (" + feature_block_index + ") don't match.");
 +          }
 +        }
 +        source_stream.writeInt(sv.lhs);
 +        source_stream.writeInt(sv.target);
 +        source_stream.writeInt(feature_block_index);
 +      }
 +    }
 +    // Flush the data stream.
 +    feature_buffer.flush(feature_stream);
 +    if (packAlignments)
 +      alignment_buffer.flush(alignment_stream);
 +
 +    target_stream.close();
 +    source_stream.close();
 +    feature_stream.close();
 +    if (packAlignments)
 +      alignment_stream.close();
 +  }
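 +
 +  /* A sketch of the record layouts produced above, in ints:
 +   *
 +   *   target node: [parent address][symbol]
 +   *   source node: [#children] ([symbol][child address])* [#values] ([lhs][target address][feature block])*
 +   */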
 +
 +  public void writeVocabulary() throws IOException {
 +    final String vocabularyFilename = output + File.separator + VOCABULARY_FILENAME;
 +    LOG.info("Writing vocabulary to {}", vocabularyFilename);
 +    Vocabulary.write(vocabularyFilename);
 +  }
 +
 +  /**
 +   * Integer-labeled, doubly-linked trie with some provisions for packing.
 +   * 
 +   * @author Juri Ganitkevitch
 +   * 
 +   * @param <D> The trie's value type.
 +   */
 +  class PackingTrie<D extends PackingTrieValue> {
 +    int symbol;
 +    PackingTrie<D> parent;
 +
 +    TreeMap<Integer, PackingTrie<D>> children;
 +    List<D> values;
 +
 +    int address;
 +
 +    PackingTrie() {
 +      address = -1;
 +
 +      symbol = 0;
 +      parent = null;
 +
 +      children = new TreeMap<Integer, PackingTrie<D>>();
 +      values = new ArrayList<D>();
 +    }
 +
 +    PackingTrie(PackingTrie<D> parent, int symbol) {
 +      this();
 +      this.parent = parent;
 +      this.symbol = symbol;
 +    }
 +
 +    void add(int[] path, D value) {
 +      add(path, 0, value);
 +    }
 +
 +    private void add(int[] path, int index, D value) {
 +      if (index == path.length)
 +        this.values.add(value);
 +      else {
 +        PackingTrie<D> child = children.get(path[index]);
 +        if (child == null) {
 +          child = new PackingTrie<D>(this, path[index]);
 +          children.put(path[index], child);
 +        }
 +        child.add(path, index + 1, value);
 +      }
 +    }
 +
 +    /**
 +     * Calculate the size (in ints) of a packed trie node. Distinguishes downwards pointing (parent
 +     * points to children) from upwards pointing (children point to parent) tries, as well as
 +     * skeletal (no data, just the labeled links) and non-skeletal (nodes have a data block)
 +     * packing. For example, a downwards-pointing, non-skeletal node with two children and one
 +     * SourceValue (size 3) occupies 1 + 2 * 2 + 1 + 3 = 9 ints.
 +     * 
 +     * @param downwards Are we packing into a downwards-pointing trie?
 +     * @param skeletal Are we packing into a skeletal trie?
 +     * 
 +     * @return Number of ints the trie node would occupy.
 +     */
 +    int size(boolean downwards, boolean skeletal) {
 +      int size = 0;
 +      if (downwards) {
 +        // Number of children and links to children.
 +        size = 1 + 2 * children.size();
 +      } else {
 +        // Link to parent.
 +        size += 2;
 +      }
 +      // Non-skeletal packing: number of data items.
 +      if (!skeletal)
 +        size += 1;
 +      // Non-skeletal packing: write size taken up by data items.
 +      if (!skeletal && !values.isEmpty())
 +        size += values.size() * values.get(0).size();
 +
 +      return size;
 +    }
 +
 +    void clear() {
 +      children.clear();
 +      values.clear();
 +    }
 +  }
 +
 +  interface PackingTrieValue {
 +    int size();
 +  }
 +
 +  class SourceValue implements PackingTrieValue {
 +    int lhs;
 +    int data;
 +    int target;
 +
 +    public SourceValue() {
 +    }
 +
 +    SourceValue(int lhs, int data) {
 +      this.lhs = lhs;
 +      this.data = data;
 +    }
 +
 +    void setTarget(int target) {
 +      this.target = target;
 +    }
 +
 +    public int size() {
 +      return 3;
 +    }
 +  }
 +
 +  class TargetValue implements PackingTrieValue {
 +    SourceValue parent;
 +
 +    TargetValue(SourceValue parent) {
 +      this.parent = parent;
 +    }
 +
 +    public int size() {
 +      return 0;
 +    }
 +  }
 +
 +  abstract class PackingBuffer<T> {
 +    private byte[] backing;
 +    protected ByteBuffer buffer;
 +
 +    protected ArrayList<Integer> memoryLookup;
 +    protected int totalSize;
 +    protected ArrayList<Integer> onDiskOrder;
 +
 +    PackingBuffer() throws IOException {
 +      allocate();
 +      memoryLookup = new ArrayList<Integer>();
 +      onDiskOrder = new ArrayList<Integer>();
 +      totalSize = 0;
 +    }
 +
 +    abstract int add(T item);
 +
 +    // Allocate a reasonably-sized buffer for the feature data.
 +    private void allocate() {
 +      backing = new byte[approximateMaximumSliceSize * DATA_SIZE_ESTIMATE];
 +      buffer = ByteBuffer.wrap(backing);
 +    }
 +
 +    // Reallocates the backing array and buffer, copying existing data over.
 +    protected void reallocate() {
 +      if (backing.length == Integer.MAX_VALUE)
 +        return;
 +      long attempted_length = backing.length * 2L;
 +      int new_length;
 +      // Detect overflow.
 +      if (attempted_length >= Integer.MAX_VALUE)
 +        new_length = Integer.MAX_VALUE;
 +      else
 +        new_length = (int) attempted_length;
 +      byte[] new_backing = new byte[new_length];
 +      System.arraycopy(backing, 0, new_backing, 0, backing.length);
 +      int old_position = buffer.position();
 +      ByteBuffer new_buffer = ByteBuffer.wrap(new_backing);
 +      new_buffer.position(old_position);
 +      buffer = new_buffer;
 +      backing = new_backing;
 +    }
 +
 +    /**
 +     * Prepare the data buffer for disk writing.
 +     */
 +    void initialize() {
 +      onDiskOrder.clear();
 +    }
 +
 +    /**
 +     * Enqueue a data block for later writing.
 +     * 
 +     * @param block_index The index of the data block to add to writing queue.
 +     * @return The to-be-written block's output index.
 +     */
 +    int write(int block_index) {
 +      onDiskOrder.add(block_index);
 +      return onDiskOrder.size() - 1;
 +    }
 +
 +    /**
 +     * Performs the actual writing to disk in the order specified by calls to write() since the last
 +     * call to initialize().
 +     * 
 +     * @param out the stream to write the header and data blocks to
 +     * @throws IOException if writing fails
 +     */
 +    void flush(DataOutputStream out) throws IOException {
 +      writeHeader(out);
 +      int size;
 +      int block_address;
 +      for (int block_index : onDiskOrder) {
 +        block_address = memoryLookup.get(block_index);
 +        size = blockSize(block_index);
 +        out.write(backing, block_address, size);
 +      }
 +    }
 +
 +    void clear() {
 +      buffer.clear();
 +      memoryLookup.clear();
 +      onDiskOrder.clear();
 +    }
 +
 +    boolean overflowing() {
 +      return (buffer.position() >= DATA_SIZE_LIMIT);
 +    }
 +
 +    private void writeHeader(DataOutputStream out) throws IOException {
 +      if (out.size() == 0) {
 +        out.writeInt(onDiskOrder.size());
 +        out.writeInt(totalSize);
 +        int disk_position = headerSize();
 +        for (int block_index : onDiskOrder) {
 +          out.writeInt(disk_position);
 +          disk_position += blockSize(block_index);
 +        }
 +      } else {
 +        throw new RuntimeException("Got a used stream for header writing.");
 +      }
 +    }
 +
 +    private int headerSize() {
 +      // One integer for each data block, plus number of blocks and total size.
 +      return 4 * (onDiskOrder.size() + 2);
 +    }
 +
 +    private int blockSize(int block_index) {
 +      int block_address = memoryLookup.get(block_index);
 +      return (block_index < memoryLookup.size() - 1 ? memoryLookup.get(block_index + 1) : totalSize)
 +          - block_address;
 +    }
 +  }
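 +
 +  /* A sketch of the data file layout written by writeHeader() and flush() above
 +   * (header fields are ints; block contents are raw bytes):
 +   *
 +   *   [#blocks][total block size] [absolute block offset]* [block]*
 +   *
 +   * Blocks appear in the order recorded in onDiskOrder.
 +   */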
 +
 +  class FeatureBuffer extends PackingBuffer<TreeMap<Integer, Float>> {
 +
 +    private IntEncoder idEncoder;
 +
 +    FeatureBuffer() throws IOException {
 +      super();
 +      idEncoder = types.getIdEncoder();
 +      LOG.info("Encoding feature ids in: {}", idEncoder.getKey());
 +    }
 +
 +    /**
 +     * Add a block of features to the buffer.
 +     * 
 +     * @param features TreeMap with the features for one rule.
 +     * @return The index of the resulting data block.
 +     */
 +    int add(TreeMap<Integer, Float> features) {
 +      int data_position = buffer.position();
 +
 +      // Over-estimate how much room this addition will need: for each
 +      // feature (ID_SIZE for label, "upper bound" of 4 for the value), plus ID_SIZE for
 +      // the number of features. If this won't fit, reallocate the buffer.
 +      int size_estimate = (4 + EncoderConfiguration.ID_SIZE) * features.size()
 +          + EncoderConfiguration.ID_SIZE;
 +      if (buffer.capacity() - buffer.position() <= size_estimate)
 +        reallocate();
 +
 +      // Write features to buffer.
 +      idEncoder.write(buffer, features.size());
 +      for (Integer k : features.descendingKeySet()) {
 +        float v = features.get(k);
 +        // Sparse features.
 +        if (v != 0.0) {
 +          idEncoder.write(buffer, k);
 +          encoderConfig.encoder(k).write(buffer, v);
 +        }
 +      }
 +      // Store position the block was written to.
 +      memoryLookup.add(data_position);
 +      // Update total size (in bytes).
 +      totalSize = buffer.position();
 +
 +      // Return block index.
 +      return memoryLookup.size() - 1;
 +    }
 +  }
 +
 +  class AlignmentBuffer extends PackingBuffer<byte[]> {
 +
 +    AlignmentBuffer() throws IOException {
 +      super();
 +    }
 +
 +    /**
 +     * Adds the alignments for one rule to the buffer. The block stores one byte with the number
 +     * of alignment points, followed by the source-target index pairs as byte pairs.
 +     * 
 +     * @param alignments a byte array with the alignment points for one rule.
 +     * @return The index of the resulting data block.
 +     */
 +    int add(byte[] alignments) {
 +      int data_position = buffer.position();
 +      int size_estimate = alignments.length + 1;
 +      if (buffer.capacity() - buffer.position() <= size_estimate)
 +        reallocate();
 +
 +      // Write alignment points to buffer.
 +      buffer.put((byte) (alignments.length / 2));
 +      buffer.put(alignments);
 +
 +      // Store position the block was written to.
 +      memoryLookup.add(data_position);
 +      // Update total size (in bytes).
 +      totalSize = buffer.position();
 +      // Return block index.
 +      return memoryLookup.size() - 1;
 +    }
 +  }
 +
 +  class PackingFileTuple implements Comparable<PackingFileTuple> {
 +    private File sourceFile;
 +    private File targetLookupFile;
 +    private File targetFile;
 +
 +    private File featureFile;
 +    private File alignmentFile;
 +
 +    PackingFileTuple(String prefix) {
 +      sourceFile = new File(output + File.separator + prefix + ".source");
 +      targetFile = new File(output + File.separator + prefix + ".target");
 +      targetLookupFile = new File(output + File.separator + prefix + ".target.lookup");
 +      featureFile = new File(output + File.separator + prefix + ".features");
 +
 +      alignmentFile = null;
 +      if (packAlignments)
 +        alignmentFile = new File(output + File.separator + prefix + ".alignments");
 +
 +      LOG.info("Allocated slice: {}", sourceFile.getAbsolutePath());
 +    }
 +
 +    DataOutputStream getSourceOutput() throws IOException {
 +      return getOutput(sourceFile);
 +    }
 +
 +    DataOutputStream getTargetOutput() throws IOException {
 +      return getOutput(targetFile);
 +    }
 +
 +    DataOutputStream getTargetLookupOutput() throws IOException {
 +      return getOutput(targetLookupFile);
 +    }
 +
 +    DataOutputStream getFeatureOutput() throws IOException {
 +      return getOutput(featureFile);
 +    }
 +
 +    DataOutputStream getAlignmentOutput() throws IOException {
 +      if (alignmentFile != null)
 +        return getOutput(alignmentFile);
 +      return null;
 +    }
 +
 +    private DataOutputStream getOutput(File file) throws IOException {
 +      if (file.createNewFile()) {
 +        return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
 +      } else {
 +        throw new RuntimeException("File doesn't exist: " + file.getName());
 +      }
 +    }
 +
 +    long getSize() {
 +      return sourceFile.length() + targetFile.length() + featureFile.length();
 +    }
 +
 +    @Override
 +    public int compareTo(PackingFileTuple o) {
 +      if (getSize() > o.getSize()) {
 +        return -1;
 +      } else if (getSize() < o.getSize()) {
 +        return 1;
 +      } else {
 +        return 0;
 +      }
 +    }
 +  }
 +}
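
A minimal usage sketch for the GrammarPacker above (paths and slice size are hypothetical;
the constructor signature is as in this commit):

    import org.apache.joshua.tools.GrammarPacker;

    public class PackExample {
      public static void main(String[] args) throws Exception {
        GrammarPacker packer = new GrammarPacker(
            "grammar.gz",      // input grammar (Hiero or Moses format)
            null,              // no packer config; feature types are auto-detected
            "grammar.packed",  // output directory
            null,              // no external alignments file
            null,              // no feature dump
            false,             // alignments are not embedded in the grammar
            1000000);          // approximate maximum slice size, in rules
        packer.pack();
      }
    }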

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/util/Constants.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/util/Constants.java
index 0000000,0000000..45d2aef
new file mode 100644
--- /dev/null
+++ b/src/main/java/org/apache/joshua/util/Constants.java
@@@ -1,0 -1,0 +1,36 @@@
++/*
++ * Licensed to the Apache Software Foundation (ASF) under one
++ * or more contributor license agreements.  See the NOTICE file
++ * distributed with this work for additional information
++ * regarding copyright ownership.  The ASF licenses this file
++ * to you under the Apache License, Version 2.0 (the
++ * "License"); you may not use this file except in compliance
++ * with the License.  You may obtain a copy of the License at
++ *
++ *  http://www.apache.org/licenses/LICENSE-2.0
++ *
++ * Unless required by applicable law or agreed to in writing,
++ * software distributed under the License is distributed on an
++ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++ * KIND, either express or implied.  See the License for the
++ * specific language governing permissions and limitations
++ * under the License.
++ */
++package org.apache.joshua.util;
++
++/**
++ * One day, all constants should be moved here (many are in Vocabulary).
++ * 
++ * @author Matt Post <po...@cs.jhu.edu>
++ */
++public final class Constants {
++  public static final String defaultNT = "[X]";
++
++  public static final String START_SYM = "<s>";
++  public static final String STOP_SYM = "</s>";
++  public static final String UNKNOWN_WORD = "<unk>";
++  
++  public static final String fieldDelimiter = "\\s\\|{3}\\s";
++  public static final String spaceSeparator = "\\s+";
++}
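
A minimal usage sketch for these constants (the input line is illustrative):

    import org.apache.joshua.util.Constants;

    public class DelimiterExample {
      public static void main(String[] args) {
        // fieldDelimiter is a regex matching the " ||| " field separator.
        String[] fields = "el gato ||| the cat ||| 0.455".split(Constants.fieldDelimiter);
        System.out.println(fields.length);  // prints 3
      }
    }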

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/util/FileUtility.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/util/FileUtility.java
index 9dad55a,0000000..a36b07f
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/util/FileUtility.java
+++ b/src/main/java/org/apache/joshua/util/FileUtility.java
@@@ -1,319 -1,0 +1,318 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util;
 +
 +import java.io.BufferedReader;
 +import java.io.BufferedWriter;
 +import java.io.Closeable;
 +import java.io.File;
 +import java.io.FileDescriptor;
 +import java.io.FileInputStream;
- import java.io.FileNotFoundException;
 +import java.io.FileOutputStream;
 +import java.io.FileReader;
 +import java.io.IOException;
 +import java.io.InputStream;
 +import java.io.OutputStream;
 +import java.io.OutputStreamWriter;
 +import java.nio.charset.Charset;
 +import java.util.LinkedList;
 +import java.util.List;
 +import java.util.Scanner;
 +
 +/**
 + * Utility functions for file operations.
 + * 
 + * @author Zhifei Li, zhifei.work@gmail.com
 + * @author wren ng thornton wren@users.sourceforge.net
 + * @since 28 February 2009
 + */
 +public class FileUtility {
 +  public static final String DEFAULT_ENCODING = "UTF-8";
 +
 +  /*
 +   * Note: charset names are case-agnostic. "UTF-8" is the canonical name; "UTF8" and
 +   * "unicode-1-1-utf-8" are aliases. Java doesn't distinguish utf8 vs UTF-8 the way Perl does.
 +   */
 +  private static final Charset FILE_ENCODING = Charset.forName(DEFAULT_ENCODING);
 +
 +  /**
 +   * Warning: will truncate/overwrite existing files.
 +   * @param filename a file for which to obtain a writer
 +   * @return the buffered writer object
 +   * @throws IOException if there is a problem opening the output file
 +   */
 +  public static BufferedWriter getWriteFileStream(String filename) throws IOException {
 +    return new BufferedWriter(new OutputStreamWriter(
 +    // TODO: add GZIP
 +        filename.equals("-") ? new FileOutputStream(FileDescriptor.out) : new FileOutputStream(
 +            filename, false), FILE_ENCODING));
 +  }
 +
 +  /**
 +   * Recursively delete the specified file or directory.
 +   * 
 +   * @param f File or directory to delete
 +   * @return <code>true</code> if the specified file or directory was deleted, <code>false</code>
 +   *         otherwise
 +   */
 +  public static boolean deleteRecursively(File f) {
 +    if (null != f) {
 +      if (f.isDirectory())
 +        for (File child : f.listFiles())
 +          deleteRecursively(child);
 +      return f.delete();
 +    } else {
 +      return false;
 +    }
 +  }
 +
 +  /**
 +   * Writes data from the integer array to disk as raw bytes, overwriting the old file if present.
 +   * 
 +   * @param data The integer array to write to disk.
 +   * @param filename The filename where the data should be written.
 +   * @throws IOException if there is a problem writing to the output file
 +   * @return the FileOutputStream on which the bytes were written
 +   */
 +  public static FileOutputStream writeBytes(int[] data, String filename) throws IOException {
 +    FileOutputStream out = new FileOutputStream(filename, false);
 +    writeBytes(data, out);
 +    return out;
 +  }
 +
 +  /**
 +   * Writes data from the integer array to disk as raw bytes.
 +   * 
 +   * @param data The integer array to write to disk.
 +   * @param out The output stream where the data should be written.
 +   * @throws IOException if there is a problem writing bytes
 +   */
 +  public static void writeBytes(int[] data, OutputStream out) throws IOException {
 +
 +    byte[] b = new byte[4];
 +
 +    for (int word : data) {
 +      b[0] = (byte) ((word >>> 24) & 0xFF);
 +      b[1] = (byte) ((word >>> 16) & 0xFF);
 +      b[2] = (byte) ((word >>> 8) & 0xFF);
 +      b[3] = (byte) ((word >>> 0) & 0xFF);
 +
 +      out.write(b);
 +    }
 +  }
 +
 +  public static void copyFile(String srFile, String dtFile) throws IOException {
 +    copyFile(new File(srFile), new File(dtFile));
 +  }
 +
 +  public static void copyFile(File srFile, File dtFile) throws IOException {
 +    InputStream in = new FileInputStream(srFile);
 +    // To append rather than overwrite, pass true as the second argument:
 +    // OutputStream out = new FileOutputStream(dtFile, true);
 +    OutputStream out = new FileOutputStream(dtFile);
 +    try {
 +      byte[] buf = new byte[1024];
 +      int len;
 +      while ((len = in.read(buf)) > 0) {
 +        out.write(buf, 0, len);
 +      }
 +      System.out.println("File copied.");
 +    } finally {
 +      in.close();
 +      out.close();
 +    }
 +  }
 +
 +  static public boolean deleteFile(String fileName) {
 +
 +    File f = new File(fileName);
 +
 +    // Make sure the file or directory exists and isn't write protected
 +    if (!f.exists())
 +      System.out.println("Delete: no such file or directory: " + fileName);
 +
 +    if (!f.canWrite())
 +      System.out.println("Delete: write protected: " + fileName);
 +
 +    // If it is a directory, make sure it is empty
 +    if (f.isDirectory()) {
 +      String[] files = f.list();
 +      if (files.length > 0)
 +        System.out.println("Delete: directory not empty: " + fileName);
 +    }
 +
 +    // Attempt to delete it
 +    boolean success = f.delete();
 +
 +    if (!success)
 +      System.out.println("Delete: deletion failed");
 +
 +    return success;
 +
 +  }
 +
 +  /**
 +   * Returns the base directory of the file. For example, dirname('/usr/local/bin/emacs') -&gt;
 +   * '/usr/local/bin'
 +   * @param fileName the input path
 +   * @return the parent path
 +   */
 +  static public String dirname(String fileName) {
 +    if (fileName.indexOf(File.separator) != -1)
 +      return fileName.substring(0, fileName.lastIndexOf(File.separator));
 +
 +    return ".";
 +  }
 +
 +  public static void createFolderIfNotExisting(String folderName) {
 +    File f = new File(folderName);
 +    if (!f.isDirectory()) {
 +      System.out.println(" createFolderIfNotExisting -- Making directory: " + folderName);
 +      f.mkdirs();
 +    } else {
 +      System.out.println(" createFolderIfNotExisting -- Directory: " + folderName
 +          + " already existed");
 +    }
 +  }
 +
 +  public static void closeCloseableIfNotNull(Closeable fileWriter) {
 +    if (fileWriter != null) {
 +      try {
 +        fileWriter.close();
 +      } catch (IOException e) {
 +        e.printStackTrace();
 +      }
 +    }
 +  }
 +
 +  /**
 +   * Returns the directory where the program was started: the base directory you implicitly
 +   * get when opening a file without specifying a full path.
 +   * @return the current 'user.dir'
 +   */
 +  public static String getWorkingDirectory() {
 +    return System.getProperty("user.dir");
 +  }
 +
 +  /**
 +   * Method to handle standard IO exceptions. catch (Exception e) {Utility.handleIO_exception(e);}
 +   * @param e an input {@link java.lang.Exception}
 +   */
 +  public static void handleExceptions(Exception e) {
 +    throw new RuntimeException(e);
 +  }
 +
 +  /**
 +   * Convenience method to get a full file as a String
 +   * @param file the input {@link java.io.File}
 +   * @return The file as a String. Lines are separated by newline character.
 +   */
 +  public static String getFileAsString(File file) {
 +    String result = "";
 +    List<String> lines = getLines(file, true);
 +    for (int i = 0; i < lines.size() - 1; i++) {
 +      result += lines.get(i) + "\n";
 +    }
 +    if (!lines.isEmpty()) {
 +      result += lines.get(lines.size() - 1);
 +    }
 +    return result;
 +  }
 +
 +  /**
 +   * This method returns a List of String. Each element of the list corresponds to a line from the
 +   * input file. The boolean keepDuplicates in the input determines if duplicate lines are allowed
 +   * in the output LinkedList or not.
 +   * @param file the input file
 +   * @param keepDuplicates whether to retain duplicate lines
 +   * @return a {@link java.util.List} of lines
 +   */
 +  static public List<String> getLines(File file, boolean keepDuplicates) {
 +    LinkedList<String> list = new LinkedList<String>();
 +    String line = "";
 +    try {
 +      BufferedReader inputReader = new BufferedReader(new FileReader(file));
 +      for (;;) { // read character by character, collecting each line of
 +        // the file into a String and processing it
 +        int current = inputReader.read();
 +        if (current == -1 || current == '\n') {
 +          if (keepDuplicates || !list.contains(line))
 +            list.add(line);
 +          line = "";
 +          if (current == -1)
 +            break; // EOF
 +        } else
 +          line += (char) current;
 +      }
 +      inputReader.close();
 +    } catch (Exception e) {
 +      handleExceptions(e);
 +    }
 +    return list;
 +  }
 +
 +  /**
 +   * Returns a Scanner of the inputFile using a specific encoding
 +   * 
 +   * @param inputFile the file for which to get a {@link java.util.Scanner} object
 +   * @param encoding the encoding to use within the Scanner
 +   * @return a {@link java.util.Scanner} object for a given file
 +   */
 +  public static Scanner getScanner(File inputFile, String encoding) {
 +    Scanner scan = null;
 +    try {
 +      scan = new Scanner(inputFile, encoding);
 +    } catch (IOException e) {
 +      FileUtility.handleExceptions(e);
 +    }
 +    return scan;
 +  }
 +
 +  /**
 +   * Returns a Scanner of the inputFile using default encoding
 +   * 
 +   * @param inputFile the file for which to get a {@link java.util.Scanner} object
 +   * @return a {@link java.util.Scanner} object for a given file
 +   */
 +  public static Scanner getScanner(File inputFile) {
 +    return getScanner(inputFile, DEFAULT_ENCODING);
 +  }
 +
 +  static public String getFirstLineInFile(File inputFile) {
 +    Scanner scan = FileUtility.getScanner(inputFile);
 +    if (!scan.hasNextLine())
 +      return null;
 +    String line = scan.nextLine();
 +    scan.close();
 +    return line;
 +  }
 +}
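
A minimal usage sketch for FileUtility.writeBytes (the output path is hypothetical):

    import org.apache.joshua.util.FileUtility;

    public class WriteBytesExample {
      public static void main(String[] args) throws Exception {
        // Each int is written as four big-endian bytes, so this writes 12 bytes.
        int[] data = { 1, 2, 3 };
        FileUtility.writeBytes(data, "/tmp/ints.bin").close();
      }
    }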

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/java/org/apache/joshua/util/io/LineReader.java
----------------------------------------------------------------------
diff --cc src/main/java/org/apache/joshua/util/io/LineReader.java
index e61e79a,0000000..5122994
mode 100644,000000..100644
--- a/src/main/java/org/apache/joshua/util/io/LineReader.java
+++ b/src/main/java/org/apache/joshua/util/io/LineReader.java
@@@ -1,369 -1,0 +1,368 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.util.io;
 +
 +import java.io.BufferedReader;
 +import java.io.FileDescriptor;
 +import java.io.FileInputStream;
 +import java.io.IOException;
 +import java.io.InputStream;
 +import java.io.InputStreamReader;
 +import java.io.File;
 +import java.nio.charset.Charset;
 +import java.util.Iterator;
 +import java.util.NoSuchElementException;
 +import java.util.zip.GZIPInputStream;
- import java.util.zip.ZipException;
 +
 +import org.apache.joshua.decoder.Decoder;
 +
 +/**
 + * This class provides an Iterator interface to a BufferedReader. This covers the most common
 + * use-cases for reading from files without ugly code to check whether we got a line or not.
 + * 
 + * @author wren ng thornton wren@users.sourceforge.net
 + * @author Matt Post post@cs.jhu.edu
 + */
 +public class LineReader implements Reader<String> {
 +
 +  /*
 +   * Note: charset names are case-agnostic. "UTF-8" is the canonical name; "UTF8" and
 +   * "unicode-1-1-utf-8" are aliases. Java doesn't distinguish utf8 vs UTF-8 the way Perl does.
 +   */
 +  private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
 +
 +  /*
 +   * The reader and its underlying input stream. We need to keep a hold of the underlying
 +   * input stream so that we can query how many raw bytes it's read (for a generic progress
 +   * meter that works across GZIP'ed and plain text files).
 +   */
 +  private BufferedReader reader;
 +  private ProgressInputStream rawStream;
 +
 +  private String buffer;
 +  private IOException error;
 +
 +  private int lineno = 0;
 +  
 +  private boolean display_progress = false;
 +  
 +  private int progress = 0;
 +
 +  // ===============================================================
 +  // Constructors and destructors
 +  // ===============================================================
 +
 +  /**
 +   * Opens a file for iterating line by line. The special "-" filename can be used to specify
 +   * STDIN. GZIP'd files are tested for automatically.
 +   * 
 +   * @param filename the file to be opened ("-" for STDIN)
 +   * @throws IOException if there is an error reading the input file
 +   */
 +  public LineReader(String filename) throws IOException {
 +    
 +    display_progress = (Decoder.VERBOSE >= 1);
 +    
 +    progress = 0;
 +    
 +    InputStream stream = null; 
 +    long totalBytes = -1;
 +    if (filename.equals("-")) {
 +      rawStream = null;
 +      stream = new FileInputStream(FileDescriptor.in);
 +    } else {
 +      totalBytes = new File(filename).length();
 +      rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
-       
++
 +      try {
 +        stream = new GZIPInputStream(rawStream);
-       } catch (ZipException e) {
++      } catch (Exception e) {
 +        // GZIP ate a byte, so reset
 +        rawStream.close();
 +        stream = rawStream = new ProgressInputStream(new FileInputStream(filename), totalBytes);
 +      }
 +    } 
 +    
 +    this.reader = new BufferedReader(new InputStreamReader(stream, FILE_ENCODING));
 +  }
 +  
 +  public LineReader(String filename, boolean show_progress) throws IOException {
 +    this(filename);
 +    display_progress = (Decoder.VERBOSE >= 1 && show_progress);
 +  }
 +
 +
 +  /**
 +   * Wraps an InputStream for iterating line by line. Stream encoding is assumed to be UTF-8.
 +   * @param in an {@link java.io.InputStream} to wrap and iterate over line by line
 +   */
 +  public LineReader(InputStream in) {
 +    this.reader = new BufferedReader(new InputStreamReader(in, FILE_ENCODING));
 +    display_progress = false;
 +  }
 +  
 +  /**
 +   * Chain to the underlying {@link ProgressInputStream}. 
 +   * 
 +   * @return an integer from 0..100, indicating how much of the file has been read.
 +   */
 +  public int progress() {
 +    return rawStream == null ? 0 : rawStream.progress();
 +  }
 +  
 +  /**
 +   * This method will close the file handle, and will raise any exceptions that occurred during
 +   * iteration. The method is idempotent, and all calls after the first are no-ops (unless the
 +   * thread was interrupted or killed). For correctness, you <b>must</b> call this method before the
 +   * object falls out of scope.
 +   * @throws IOException if there is an error closing the file handler
 +   */
 +  public void close() throws IOException {
 +
 +    this.buffer = null; // Just in case it's a large string
 +
 +    if (null != this.reader) {
 +      try {
 +        // We assume the wrappers will percolate this down.
 +        this.reader.close();
 +
 +      } catch (IOException e) {
 +        // We need to trash our cached error for idempotence.
 +        // Presumably the closing error is the more important
 +        // one to throw.
 +        this.error = null;
 +        throw e;
 +
 +      } finally {
 +        this.reader = null;
 +      }
 +    }
 +
 +    if (null != this.error) {
 +      IOException e = this.error;
 +      this.error = null;
 +      throw e;
 +    }
 +  }
 +
 +
 +  /**
 +   * We attempt to avoid leaking file descriptors if you fail to call close before the object falls
 +   * out of scope. However, the language spec makes <b>no guarantees</b> about timeliness of garbage
 +   * collection. It is a bug to rely on this method to release the resources. Also, the garbage
 +   * collector will discard any exceptions that have queued up, without notifying the application in
 +   * any way.
 +   * 
 +   * Having a finalizer means the JVM can't do "fast allocation" of LineReader objects (or
 +   * subclasses). This isn't too important due to disk latency, but may be worth noting.
 +   * 
 +   * @see <a
 +   *      href="http://java2go.blogspot.com/2007/09/javaone-2007-performance-tips-2-finish.html">Performance
 +   *      Tips</a>
 +   * @see <a
 +   *      href="http://www.javaworld.com/javaworld/jw-06-1998/jw-06-techniques.html?page=1">Techniques</a>
 +   */
 +  protected void finalize() throws Throwable {
 +    try {
 +      this.close();
 +    } catch (IOException e) {
 +      // Do nothing. The GC will discard the exception
 +      // anyways, but it may cause us to linger on the heap.
 +    } finally {
 +      super.finalize();
 +    }
 +  }
 +
 +
 +
 +  // ===============================================================
 +  // Reader
 +  // ===============================================================
 +
 +  // Copied from interface documentation.
 +  /** Determine if the reader is ready to read a line. */
 +  public boolean ready() throws IOException {
 +    return this.reader.ready();
 +  }
 +
 +
 +  /**
 +   * This method is like next() except that it throws the IOException directly. If there are no
 +   * lines to be read then null is returned.
 +   */
 +  public String readLine() throws IOException {
 +    if (this.hasNext()) {
 +      String line = this.buffer;
 +      this.buffer = null;
 +      return line;
 +
 +    } else {
 +      if (null != this.error) {
 +        IOException e = this.error;
 +        this.error = null;
 +        throw e;
 +      }
 +      return null;
 +    }
 +  }
 +
 +
 +  // ===============================================================
 +  // Iterable -- because sometimes Java can be very stupid
 +  // ===============================================================
 +
 +  /** Return self as an iterator. */
 +  public Iterator<String> iterator() {
 +    return this;
 +  }
 +
 +
 +  // ===============================================================
 +  // Iterator
 +  // ===============================================================
 +
 +  // Copied from interface documentation.
 +  /**
 +   * Returns <code>true</code> if the iteration has more elements. (In other words, returns
 +   * <code>true</code> if <code>next</code> would return an element rather than throwing an
 +   * exception.)
 +   */
 +  public boolean hasNext() {
 +    if (null != this.buffer) {
 +      return true;
 +
 +    } else if (null != this.error) {
 +      return false;
 +
 +    } else {
 +      // We're not allowed to throw IOException from within Iterator
 +      try {
 +        this.buffer = this.reader.readLine();
 +      } catch (IOException e) {
 +        this.buffer = null;
 +        this.error = e;
 +        return false;
 +      }
 +      return (null != this.buffer);
 +    }
 +  }
 +
 +
 +  /**
 +   * Returns the next line of the file, throwing NoSuchElementException when no line remains.
 +   * If an I/O error occurs, this method also throws NoSuchElementException; the underlying
 +   * IOException is cached and rethrown when the LineReader is closed.
 +   */
 +  public String next() throws NoSuchElementException {
 +    if (this.hasNext()) {
 +      if (display_progress) {
 +        int newProgress = (reader != null) ? progress() : 100;
 +
 +        if (newProgress > progress) {
 +          // Advance a textual progress meter: a dot per percent, the number itself at
 +          // each multiple of ten, and "100%" at the end (columns 97-99 print the
 +          // digits of "100" so the line terminates with "100%").
 +          for (int i = progress + 1; i <= newProgress; i++)
 +            if (i == 97) {
 +              System.err.print("1");
 +            } else if (i == 98) {
 +              System.err.print("0");
 +            } else if (i == 99) {
 +              System.err.print("0");
 +            } else if (i == 100) {
 +              System.err.println("%");
 +            } else if (i % 10 == 0) {
 +              System.err.print(String.format("%d", i));
 +              System.err.flush();
 +            } else if ((i - 1) % 10 == 0)
 +              ; // skip the column after each multiple of ten, since "10", "20", etc. took two columns
 +            else {
 +              System.err.print(".");
 +              System.err.flush();
 +            }
 +          progress = newProgress;
 +        }
 +      }
 +      
 +      String line = this.buffer;
 +      this.lineno++;
 +      this.buffer = null;
 +      return line;
 +    } else {
 +      throw new NoSuchElementException();
 +    }
 +  }
 +  
 +  /** Returns the line number of the last line returned by next(). */
 +  public int lineno() {
 +    return this.lineno;
 +  }
 +
 +  /** Unsupported. */
 +  public void remove() throws UnsupportedOperationException {
 +    throw new UnsupportedOperationException();
 +  }
 +
 +
 +  /**
 +   * Iterates over all remaining lines, ignoring their contents, and returns their count. If some
 +   * lines have already been read, only the remaining lines are counted. Because no lines remain
 +   * afterwards, this method implicitly calls close.
 +   * 
 +   * @return the number of lines read
 +   * @throws IOException if there is an error reading lines
 +   */
 +  public int countLines() throws IOException {
 +    int lines = 0;
 +
 +    while (this.hasNext()) {
 +      this.next();
 +      lines++;
 +    }
 +    this.close();
 +
 +    return lines;
 +  }
 +
 +  /** 
 +   * Example usage code.
 +   * @param args an input file
 +   */
 +  public static void main(String[] args) {
 +    if (1 != args.length) {
 +      System.err.println("Usage: java LineReader filename");
 +      System.exit(1);
 +    }
 +
 +    try {
 +
 +      LineReader in = new LineReader(args[0]);
 +      try {
 +        for (String line : in) {
 +
 +          System.out.println(line);
 +
 +        }
 +      } finally {
 +        in.close();
 +      }
 +
 +    } catch (IOException e) {
 +      e.printStackTrace();
 +    }
 +  }
 +}
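
A minimal usage sketch of the deferred-error contract implemented above (hypothetical
demo class; only LineReader itself is from this tree): hasNext() caches any IOException
and returns false, next() then throws NoSuchElementException, and close() rethrows the
cached error.

import java.io.IOException;

import org.apache.joshua.util.io.LineReader;

public class LineReaderDemo {
  public static void main(String[] args) throws IOException {
    LineReader reader = new LineReader(args[0]);
    try {
      // The for-each loop goes through hasNext()/next(), so a read error ends
      // the loop quietly instead of throwing here.
      for (String line : reader) {
        System.out.println(reader.lineno() + ": " + line);
      }
    } finally {
      reader.close(); // rethrows the IOException deferred by hasNext(), if any
    }
  }
}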

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/main/resources/log4j.properties
----------------------------------------------------------------------
diff --cc src/main/resources/log4j.properties
index 1716c1c,0000000..acca5e9
mode 100644,000000..100644
--- a/src/main/resources/log4j.properties
+++ b/src/main/resources/log4j.properties
@@@ -1,20 -1,0 +1,20 @@@
 +# Licensed to the Apache Software Foundation (ASF) under one or more
 +# contributor license agreements.  See the NOTICE file distributed with
 +# this work for additional information regarding copyright ownership.
 +# The ASF licenses this file to You under the Apache License, Version 2.0
 +# (the "License"); you may not use this file except in compliance with
 +# the License.  You may obtain a copy of the License at
 +#
 +#     http://www.apache.org/licenses/LICENSE-2.0
 +#
 +# Unless required by applicable law or agreed to in writing, software
 +# distributed under the License is distributed on an "AS IS" BASIS,
 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# See the License for the specific language governing permissions and
 +# limitations under the License.
 +
 +# log4j settings
- log4j.rootLogger=WARN, stderr
++log4j.rootLogger=WARN, stdout
 +log4j.appender.stdout=org.apache.log4j.ConsoleAppender
- log4j.appender.stdout.Target=System.out
++log4j.appender.stdout.Target=System.err
 +log4j.appender.stdout.layout=org.apache.log4j.SimpleLayout
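
A quick sketch of what this configuration yields at runtime (hypothetical demo class;
the SLF4J API is what the rest of the tree logs through). Note that the appender is
named "stdout" but its Target setting routes output to System.err:

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LoggingDemo {
  private static final Logger LOG = LoggerFactory.getLogger(LoggingDemo.class);

  public static void main(String[] args) {
    LOG.warn("printed to System.err by the ConsoleAppender configured above");
    LOG.info("suppressed: INFO is below the WARN root threshold");
  }
}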

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/test/java/org/apache/joshua/corpus/CorpusArrayTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/corpus/CorpusArrayTest.java
index de6f32e,0000000..19cb20c
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/corpus/CorpusArrayTest.java
+++ b/src/test/java/org/apache/joshua/corpus/CorpusArrayTest.java
@@@ -1,191 -1,0 +1,177 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.corpus;
 +
- import java.io.File;
- import java.io.IOException;
- import java.io.PrintStream;
- import java.util.Date;
 +import java.util.logging.Logger;
 +
- //import org.apache.joshua.corpus.CorpusArray; 
- import org.apache.joshua.corpus.Phrase; 
- //import org.apache.joshua.corpus.mm.MemoryMappedCorpusArray; 
- //import org.apache.joshua.corpus.suffix_array.SuffixArrayFactory; 
- import org.apache.joshua.corpus.Vocabulary; 
- //import org.apache.joshua.util.FormatUtil; 
- 
- import org.testng.Assert;
- import org.testng.annotations.Test;
- 
 +public class CorpusArrayTest {
 +
 +  /** Logger for this class. */
 +  private static Logger logger =
 +      Logger.getLogger(CorpusArrayTest.class.getName());
 +}
 +
 +//  @Test
 +//  public void writePartsToDisk() {
 +//
 +//    String filename = "data/tiny.en";
 +//    int numSentences = 5;  // Should be 5 sentences in tiny.en
 +//    int numWords = 89;     // Should be 89 words in tiny.en
 +//
 +//
 +//    try {
 +//
 +//      // FIX: can't use createVocabulary(String) because we set numWords and numSentences
 +//      Vocabulary vocab = new Vocabulary();
 +//      SuffixArrayFactory.createVocabulary(filename, vocab);
 +//      Corpus corpus = SuffixArrayFactory.createCorpusArray(filename, vocab, numWords, numSentences);
 +//
 +//      corpus.writeWordIDsToFile(filename+".bin");
 +//      corpus.writeSentenceLengthsToFile(filename+".sbin");
 +//
 +//      MemoryMappedCorpusArray mmCorpus = new MemoryMappedCorpusArray(corpus.getVocabulary(), filename+".bin", numWords*4, filename+".sbin", numSentences*4);
 +//
 +//      // For each word in the corpus,
 +//      for (int i=0; i<corpus.size(); i++) {
 +//
 +//        // Verify that the memory-mapped corpus and the in-memory corpus have the same value
 +//        Assert.assertEquals(mmCorpus.getWordID(i), corpus.getWordID(i));
 +//      }
 +//
 +//
 +//      // For each sentence in the corpus
 +//      for (int i=0; i<corpus.sentences.length; i++) {
 +//
 +//        // Verify that the sentence position in the memory-mapped corpus and the in-memory corpus have the same value
 +//        Assert.assertEquals(corpus.getSentencePosition(i), mmCorpus.getSentencePosition(i));
 +//      }
 +//
 +//    } catch (IOException e) {
 +//      Assert.fail(e.getLocalizedMessage());
 +//    }
 +//
 +//  }
 +//
 +//  @Test
 +//  public void iterate() {
 +//
 +//    String[] sentences = {
 +//        "scientists complete sequencing of the chromosome linked to early dementia",
 +//        "( afp , paris , january 2 ) an international team of scientists said that they have completed the sequencing of human chromosome 14 that is linked to many diseases , including the early-onset alzheimer's that may strike people in their 30s .",
 +//        "this is the fourth chromosome whose sequence has been completed to date . it comprises more than 87 million pairs of dna .",
 +//        "this study published in the weekly british scientific journal nature illustrates that the sequence of chromosome 14 comprises 1,050 genes and gene fragments .",
 +//        "the goal of geneticists is to provide diagnostic tools to identify defective genes that cause diseases so as to arrive eventually at treatments that can prevent those genes from malfunctioning ."
 +//    };
 +//
 +//
 +//
 +//    // Tell System.out and System.err to use UTF8
 +//    FormatUtil.useUTF8();
 +//
 +//    try {
 +//
 +//      File sourceFile = File.createTempFile("source", new Date().toString());
 +//      PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8");
 +//      for (String sentence : sentences) {
 +//        sourcePrintStream.println(sentence);
 +//      }
 +//      sourcePrintStream.close();
 +//      String corpusFileName = sourceFile.getAbsolutePath();
 +//
 +//      Vocabulary vocabulary;
 +//
 +//      logger.fine("Constructing vocabulary from file " + corpusFileName);
 +//      vocabulary = new Vocabulary();
 +//      int[] lengths = Vocabulary.initializeVocabulary(corpusFileName, vocabulary, true);
 +//
 +//      logger.fine("Constructing corpus array from file " + corpusFileName);
 +//      Corpus corpus = SuffixArrayFactory.createCorpusArray(corpusFileName, vocabulary, lengths[0], lengths[1]);
 +//
 +//      int expectedIndex = 0;
 +//      for (int actualIndex : corpus.corpusPositions()) {
 +//        Assert.assertEquals(actualIndex, expectedIndex);
 +//        expectedIndex += 1;
 +//      }
 +//
 +//      Assert.assertEquals(corpus.size(), expectedIndex);
 +//
 +//
 +//    } catch (IOException e) {
 +//      Assert.fail("Unable to write temporary file. " + e.toString());
 +//    }
 +//
 +//
 +//
 +//  }
 +//
 +//
 +//  @Test
 +//  public void writeAllToDisk() throws ClassNotFoundException {
 +//
 +//    String filename = "data/tiny.en";
 +//    int numSentences = 5;  // Should be 5 sentences in tiny.en
 +//    int numWords = 89;     // Should be 89 words in tiny.en
 +//
 +//
 +//    try {
 +//
 +//      // FIX: can't use createVocabulary(String) because we set numWords and numSentences
 +//      Vocabulary vocab = new Vocabulary();
 +//      Vocabulary.initializeVocabulary(filename, vocab, true);
 +//      CorpusArray corpus = SuffixArrayFactory.createCorpusArray(filename, vocab, numWords, numSentences);
 +//
 +//      corpus.write(filename+".corpus", filename+".vocab", "UTF-8");
 +//
 +//      MemoryMappedCorpusArray mmCorpus = new MemoryMappedCorpusArray(filename+".corpus", filename+".vocab");
 +//
 +//      Assert.assertEquals(mmCorpus.size(), corpus.size());
 +//      Assert.assertEquals(mmCorpus.getNumSentences(), corpus.getNumSentences());
 +//
 +//      // For each word in the corpus,
 +//      for (int i=0; i<corpus.size(); i++) {
 +//
 +//        // Verify that the memory-mapped corpus and the in-memory corpus have the same value
 +//        Assert.assertEquals(mmCorpus.getWordID(i), corpus.getWordID(i));
 +//      }
 +//
 +//
 +//      // For each sentence in the corpus
 +//      for (int i=0; i<corpus.sentences.length; i++) {
 +//
 +//        // Verify that the sentence start position in the memory-mapped corpus and the in-memory corpus have the same value
 +//        Assert.assertEquals(mmCorpus.getSentencePosition(i), corpus.getSentencePosition(i));
 +//
 +//        // Verify that the sentence end position in the memory-mapped corpus and the in-memory corpus have the same value
 +//        Assert.assertEquals(mmCorpus.getSentenceEndPosition(i), corpus.getSentenceEndPosition(i));
 +//
 +//        // Verify that the phrase corresponding to this sentence is the same
 +//        Phrase sentence = corpus.getSentence(i);
 +//        Phrase mmSentence = mmCorpus.getSentence(i);
 +//        Assert.assertNotNull(sentence);
 +//        Assert.assertNotNull(mmSentence);
 +//        Assert.assertEquals(mmSentence, sentence);
 +//      }
 +//
 +//    } catch (IOException e) {
 +//      Assert.fail(e.getLocalizedMessage());
 +//    }
 +//
 +//  }
 +//
 +//}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/test/java/org/apache/joshua/corpus/VocabularyTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/corpus/VocabularyTest.java
index a282ba3,0000000..0c9ea15
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/corpus/VocabularyTest.java
+++ b/src/test/java/org/apache/joshua/corpus/VocabularyTest.java
@@@ -1,135 -1,0 +1,136 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.corpus;
 +
- import static org.junit.Assert.*;
++import static org.apache.joshua.util.FormatUtils.isNonterminal;
++import static org.junit.Assert.assertEquals;
++import static org.junit.Assert.assertFalse;
++import static org.junit.Assert.assertTrue;
 +
 +import java.io.File;
 +import java.io.IOException;
 +
 +import org.apache.joshua.util.FormatUtils;
 +import org.junit.After;
 +import org.junit.Before;
 +import org.junit.Rule;
 +import org.junit.Test;
 +import org.junit.rules.TemporaryFolder;
 +
 +public class VocabularyTest {
 +  private static final String WORD1 = "word1";
 +  private static final String WORD2 = "word2";
 +  private static final String NON_TERMINAL = "[X]";
 +  private static final String GOAL = "[GOAL]";
 +
 +  @Before
 +  public void init() {
 +    Vocabulary.clear();
 +  }
 +  
 +  @After
 +  public void deinit() {
 +    Vocabulary.clear();
 +  }
 +  
 +  @Test
 +  public void givenVocabulary_whenEmpty_thenOnlyContainsUnknownWord() {
 +    assertTrue(Vocabulary.hasId(Vocabulary.UNKNOWN_ID));
 +    assertFalse(Vocabulary.hasId(1));
 +    assertFalse(Vocabulary.hasId(-1));
 +    assertEquals(Vocabulary.UNKNOWN_WORD, Vocabulary.word(Vocabulary.UNKNOWN_ID));
 +    assertEquals(1, Vocabulary.size());
 +  }
 +  
 +  @Test
 +  public void givenVocabulary_whenNewWord_thenMappingIsAdded() {
 +    final int FIRST_WORD_ID = 1;
 +    assertFalse(Vocabulary.hasId(FIRST_WORD_ID));
 +    assertEquals(FIRST_WORD_ID, Vocabulary.id(WORD1));
 +    //should return same id after second call:
 +    assertEquals(FIRST_WORD_ID, Vocabulary.id(WORD1));
 +    assertTrue(Vocabulary.hasId(FIRST_WORD_ID));
 +    assertEquals(WORD1, Vocabulary.word(FIRST_WORD_ID));
 +    assertEquals(2, Vocabulary.size());
 +  }
 +  
 +  @Test
 +  public void givenVocabulary_whenCheckingStringInBracketsOrNegativeNumber_thenIsNonTerminal() {
 +    //non-terminals
-     assertTrue(FormatUtils.isNonterminal(NON_TERMINAL));
++    assertTrue(isNonterminal(NON_TERMINAL));
 +    //terminals
-     assertFalse(FormatUtils.isNonterminal(WORD1));
-     assertFalse(FormatUtils.isNonterminal("[]"));
-     assertFalse(FormatUtils.isNonterminal("["));
-     assertFalse(FormatUtils.isNonterminal("]"));
-     assertFalse(FormatUtils.isNonterminal(""));
++    assertFalse(isNonterminal(WORD1));
++    assertFalse(isNonterminal("[]"));
++    assertFalse(isNonterminal("["));
++    assertFalse(isNonterminal("]"));
++    assertFalse(isNonterminal(""));
 +    
 +    //negative numbers indicate non-terminals
-     assertTrue(FormatUtils.isNonterminal(-1));
-     assertTrue(FormatUtils.isNonterminal(-5));
++    assertTrue(isNonterminal(-1));
++    assertTrue(isNonterminal(-5));
 +    
 +    //positive numbers indicate terminals:
-     assertFalse(FormatUtils.isNonterminal(0));
-     assertFalse(FormatUtils.isNonterminal(5));
- 
-     
++    assertFalse(isNonterminal(0));
++    assertFalse(isNonterminal(5));
 +  }
 +  
 +  @Test
 +  public void givenVocabulary_whenNonTerminal_thenReturnsStrictlyPositiveNonTerminalIndices() {
 +    final int FIRST_NON_TERMINAL_INDEX = 1;
 +    assertTrue(Vocabulary.id(NON_TERMINAL) < 0);
 +    assertTrue(Vocabulary.hasId(FIRST_NON_TERMINAL_INDEX));
 +    assertTrue(Vocabulary.hasId(-FIRST_NON_TERMINAL_INDEX));
 +    
 +    assertTrue(Vocabulary.id("") > 0);
 +    assertTrue(Vocabulary.id(WORD1) > 0);
 +    
 +    final int SECOND_NON_TERMINAL_INDEX = 4;
 +    assertTrue(Vocabulary.id(GOAL) < 0);
 +    assertTrue(Vocabulary.hasId(SECOND_NON_TERMINAL_INDEX));
 +    assertTrue(Vocabulary.hasId(-SECOND_NON_TERMINAL_INDEX));
 +    
 +    assertTrue(Vocabulary.id(WORD2) > 0);
 +  }
 +  
 +  @Rule
 +  public TemporaryFolder folder = new TemporaryFolder();
 +  
 +  @Test
 +  public void givenVocabulary_whenWritingAndReading_thenVocabularyStaysTheSame() throws IOException {
 +    File vocabFile = folder.newFile();
 +    
 +    int id1 = Vocabulary.id(WORD1);
 +    int id2 = Vocabulary.id(NON_TERMINAL);
 +    int id3 = Vocabulary.id(WORD2);
 +    
 +    Vocabulary.write(vocabFile.getAbsolutePath());
 +    
 +    Vocabulary.clear();
 +    
 +    Vocabulary.read(vocabFile);
 +    
 +    assertEquals(4, Vocabulary.size()); //unknown word + 3 other words
 +    assertTrue(Vocabulary.hasId(id1));
 +    assertTrue(Vocabulary.hasId(id2));
 +    assertTrue(Vocabulary.hasId(id3));
 +    assertEquals(id1, Vocabulary.id(WORD1));
 +    assertEquals(id2, Vocabulary.id(NON_TERMINAL));
 +    assertEquals(id3, Vocabulary.id(WORD2));
 +  }
 +}
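
A compact sketch of the ID convention these tests exercise (hypothetical demo class;
the sign behavior follows the assertions above): terminals receive strictly positive
IDs, while bracketed nonterminals like "[X]" receive strictly negative ones.

import org.apache.joshua.corpus.Vocabulary;

public class VocabularyDemo {
  public static void main(String[] args) {
    Vocabulary.clear();
    int terminal = Vocabulary.id("word1");  // > 0
    int nonterminal = Vocabulary.id("[X]"); // < 0
    System.out.println(terminal > 0);       // true
    System.out.println(nonterminal < 0);    // true
    // Nonterminal IDs are registered under both signs:
    System.out.println(Vocabulary.hasId(nonterminal) && Vocabulary.hasId(-nonterminal)); // true
  }
}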

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/test/java/org/apache/joshua/decoder/DecoderThreadTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/decoder/DecoderThreadTest.java
index ed49c2a,0000000..326ab23
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/decoder/DecoderThreadTest.java
+++ b/src/test/java/org/apache/joshua/decoder/DecoderThreadTest.java
@@@ -1,176 -1,0 +1,172 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder;
 +
 +import java.io.File;
 +import java.io.IOException;
 +import java.io.PrintStream;
 +import java.util.Date;
- import java.util.Scanner;
- 
- import org.apache.joshua.corpus.Corpus;
- import org.apache.joshua.corpus.Vocabulary;
 +
 +import org.testng.Assert;
 +import org.testng.annotations.Test;
 +
 +/**
 + * Unit tests for decoder thread.
 + * 
 + * @author Lane Schwartz
 + * @version $LastChangedDate$
 + */
 +public class DecoderThreadTest {
 +
 +  @Test
 +  public void setup() {
 +
 +    String[] sourceSentences = {
 +        "a b c d",
 +        "a b c d",
 +        "a b c d"
 +    };
 +
 +    String[] targetSentences = {
 +        "w x y z",
 +        "w t u v",
 +        "s x y z"
 +    };
 +
 +    String[] alignmentLines = {
 +        "0-0 1-1 2-2 3-3",
 +        "0-0 1-1 2-2 3-3",
 +        "0-0 1-1 2-2 3-3"
 +    };
 +
 +    String[] testSentences = {
 +        "a b c"	
 +    };
 +
 +    try {
 +
 +      // Set up source corpus
 +      File sourceFile = File.createTempFile("source", new Date().toString());
 +      PrintStream sourcePrintStream = new PrintStream(sourceFile, "UTF-8");
 +      for (String sentence : sourceSentences) {
 +        sourcePrintStream.println(sentence);
 +      }
 +      sourcePrintStream.close();
 +      String sourceCorpusFileName = sourceFile.getAbsolutePath();
 +
 +//      Vocabulary vocabulary = new Vocabulary();
 +//      int[] sourceLengths = Vocabulary.initializeVocabulary(sourceCorpusFileName, vocabulary, true);
 +//      Assert.assertEquals(sourceLengths.length, 2);
 +//      int numberOfSentences = sourceLengths[1];
 +//
 +//      Corpus sourceCorpus = SuffixArrayFactory.createCorpusArray(sourceCorpusFileName, vocabulary, sourceLengths[0], sourceLengths[1]);
 +
 +
 +      // Set up target corpus
 +      File targetFile = File.createTempFile("target", new Date().toString());
 +      PrintStream targetPrintStream = new PrintStream(targetFile, "UTF-8");
 +      for (String sentence : targetSentences) {
 +        targetPrintStream.println(sentence);
 +      }
 +      targetPrintStream.close();
 +      String targetCorpusFileName = targetFile.getAbsolutePath();
 +
 +//      int[] targetLengths = Vocabulary.initializeVocabulary(targetCorpusFileName, vocabulary, true);
 +//      Assert.assertEquals(targetLengths.length, sourceLengths.length);
 +//      for (int i=0, n=targetLengths.length; i<n; i++) {
 +//        Assert.assertEquals(targetLengths[i], sourceLengths[i]);
 +//      }
 +//
 +//      Corpus targetCorpus = SuffixArrayFactory.createCorpusArray(targetCorpusFileName, vocabulary, targetLengths[0], targetLengths[1]);
 +
 +
 +      // Construct alignments data structure
 +      File alignmentsFile = File.createTempFile("alignments", new Date().toString());
 +      PrintStream alignmentsPrintStream = new PrintStream(alignmentsFile, "UTF-8");
 +      for (String sentence : alignmentLines) {
 +        alignmentsPrintStream.println(sentence);
 +      }
 +      alignmentsPrintStream.close();
 +      String alignmentFileName = alignmentsFile.getAbsolutePath();
 +
 +//      AlignmentGrids grids = new AlignmentGrids(
 +//          new Scanner(alignmentsFile), 
 +//          sourceCorpus, 
 +//          targetCorpus, 
 +//          numberOfSentences);
 +
 +
 +      // Set up test corpus
 +      File testFile = File.createTempFile("test", new Date().toString());
 +      PrintStream testPrintStream = new PrintStream(testFile, "UTF-8");
 +      for (String sentence : testSentences) {
 +        testPrintStream.println(sentence);
 +      }
 +      testPrintStream.close();
 +      String testFileName = testFile.getAbsolutePath();
 +
 +      // Filename of the extracted rules file.
 +      String rulesFileName; {	
 +        File rulesFile = File.createTempFile("rules", new Date().toString());
 +        rulesFileName = rulesFile.getAbsolutePath();
 +      }
 +
 +      String joshDirName; {
 +        File joshDir = File.createTempFile(new Date().toString(), "josh");
 +        joshDirName = joshDir.getAbsolutePath();
 +        joshDir.delete();
 +      }
 +
 +
 +//      Compile compileJoshDir = new Compile();
 +//      compileJoshDir.setSourceCorpus(sourceCorpusFileName);
 +//      compileJoshDir.setTargetCorpus(targetCorpusFileName);
 +//      compileJoshDir.setAlignments(alignmentFileName);
 +//      compileJoshDir.setOutputDir(joshDirName);
 +//      compileJoshDir.execute();
 +//
 +//      ExtractRules extractRules = new ExtractRules();
 +//      extractRules.setJoshDir(joshDirName);
 +//      extractRules.setTestFile(testFileName);
 +//      extractRules.setOutputFile(rulesFileName);
 +//      extractRules.execute();
 +
 +    } catch (IOException e) {
 +      Assert.fail("Unable to write temporary file. " + e.toString());
 +    }
 +//    } catch (ClassNotFoundException e) {
 +//      Assert.fail("Unable to extract rules. " + e.toString());
 +//    }
 +  }
 +
 +  @Test
 +  public void basicSuffixArrayGrammar() {
 +
 +    // Write configuration to temp file on disk
 +    //		String configFile;
 +
 +
 +    //		JoshuaDecoder decoder = 
 +    //			JoshuaDecoder.getUninitalizedDecoder(configFile);
 +
 +
 +
 +  }
 +
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
index c2cb031,0000000..caeeeb3
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
+++ b/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
@@@ -1,80 -1,0 +1,78 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.kbest_extraction;
 +
++import static com.google.common.base.Charsets.UTF_8;
++import static java.nio.file.Files.readAllBytes;
++import static org.junit.Assert.assertEquals;
++
 +import java.io.IOException;
 +import java.nio.file.Path;
 +import java.nio.file.Paths;
 +
- import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.Translation;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +import org.junit.After;
 +import org.junit.Before;
 +import org.junit.Test;
 +
- import static com.google.common.base.Charsets.UTF_8;
- import static java.nio.file.Files.readAllBytes;
- import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
- import static org.junit.Assert.assertEquals;
- 
 +/**
 + * Reimplements the k-best extraction regression test.
 + * TODO (fhieber): this test strangely only works with StateMinimizing KenLM;
 + * this remains to be investigated.
 + */
 +public class KBestExtractionTest {
 +  
 +  private static final String CONFIG = "resources/kbest_extraction/joshua.config";
 +  private static final String INPUT = "a b c d e";
 +  private static final Path GOLD_PATH = Paths.get("resources/kbest_extraction/output.scores.gold");
 +  
 +  private JoshuaConfiguration joshuaConfig = null;
 +  private Decoder decoder = null;
 +  
 +  @Before
 +  public void setUp() throws Exception {
 +    joshuaConfig = new JoshuaConfiguration();
 +    joshuaConfig.readConfigFile(CONFIG);
 +    joshuaConfig.outputFormat = "%i ||| %s ||| %c";
 +    decoder = new Decoder(joshuaConfig, "");
 +  }
 +  
 +  @After
 +  public void tearDown() throws Exception {
 +    decoder.cleanUp();
 +    decoder = null;
 +  }
 +  
 +  @Test
 +  public void givenInput_whenKbestExtraction_thenOutputIsAsExpected() throws IOException {
 +    final String translation = decode(INPUT).toString();
 +    final String gold = new String(readAllBytes(GOLD_PATH), UTF_8);
 +    assertEquals(gold, translation);
 +  }
 +  
 +  private Translation decode(String input) {
 +    final Sentence sentence = new Sentence(input, 0, joshuaConfig);
 +    return decoder.decode(sentence);
 +  }
 +
 +}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/91400fe2/src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
----------------------------------------------------------------------
diff --cc src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
index 4612b44,0000000..04078c6
mode 100644,000000..100644
--- a/src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
+++ b/src/test/java/org/apache/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
@@@ -1,77 -1,0 +1,75 @@@
 +/*
 + * Licensed to the Apache Software Foundation (ASF) under one
 + * or more contributor license agreements.  See the NOTICE file
 + * distributed with this work for additional information
 + * regarding copyright ownership.  The ASF licenses this file
 + * to you under the Apache License, Version 2.0 (the
 + * "License"); you may not use this file except in compliance
 + * with the License.  You may obtain a copy of the License at
 + *
 + *  http://www.apache.org/licenses/LICENSE-2.0
 + *
 + * Unless required by applicable law or agreed to in writing,
 + * software distributed under the License is distributed on an
 + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 + * KIND, either express or implied.  See the License for the
 + * specific language governing permissions and limitations
 + * under the License.
 + */
 +package org.apache.joshua.decoder.phrase.constrained;
 +
++import static com.google.common.base.Charsets.UTF_8;
++import static java.nio.file.Files.readAllBytes;
++import static org.junit.Assert.assertEquals;
++
 +import java.io.IOException;
 +import java.nio.file.Path;
 +import java.nio.file.Paths;
 +
- import org.apache.joshua.corpus.Vocabulary;
 +import org.apache.joshua.decoder.Decoder;
 +import org.apache.joshua.decoder.JoshuaConfiguration;
 +import org.apache.joshua.decoder.Translation;
 +import org.apache.joshua.decoder.segment_file.Sentence;
 +
 +import org.junit.After;
 +import org.junit.Before;
 +import org.junit.Test;
 +
- import static com.google.common.base.Charsets.UTF_8;
- import static java.nio.file.Files.readAllBytes;
- import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
- import static org.junit.Assert.assertEquals;
- 
 +/**
 + * Reimplements the constrained phrase decoding regression test.
 + */
 +public class ConstrainedPhraseDecodingTest {
 +  
 +  private static final String CONFIG = "resources/phrase_decoder/constrained.config";
 +  private static final String INPUT = "una estrategia republicana para obstaculizar la reelecci�n de Obama ||| President Obama to hinder a strategy for Republican re @-@ election";
 +  private static final Path GOLD_PATH = Paths.get("resources/phrase_decoder/constrained.output.gold");
 +  
 +  private JoshuaConfiguration joshuaConfig = null;
 +  private Decoder decoder = null;
 +  
 +  @Before
 +  public void setUp() throws Exception {
 +    joshuaConfig = new JoshuaConfiguration();
 +    joshuaConfig.readConfigFile(CONFIG);
 +    decoder = new Decoder(joshuaConfig, "");
 +  }
 +  
 +  @After
 +  public void tearDown() throws Exception {
 +    decoder.cleanUp();
 +    decoder = null;
 +  }
 +  
 +  @Test
 +  public void givenInput_whenConstrainedPhraseDecoding_thenOutputIsAsExpected() throws IOException {
 +    final String translation = decode(INPUT).toString();
 +    final String gold = new String(readAllBytes(GOLD_PATH), UTF_8);
 +    assertEquals(gold, translation);
 +  }
 +  
 +  private Translation decode(String input) {
 +    final Sentence sentence = new Sentence(input, 0, joshuaConfig);
 +    return decoder.decode(sentence);
 +  }
 +
 +}