You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:36 UTC
[25/60] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
new file mode 100644
index 0000000..b9208d2
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
@@ -0,0 +1,959 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import static org.apache.joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import java.util.TreeMap;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.encoding.EncoderConfiguration;
+import org.apache.joshua.util.encoding.FeatureTypeAnalyzer;
+import org.apache.joshua.util.encoding.IntEncoder;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class GrammarPacker {
+
+  private static final Logger LOG = LoggerFactory.getLogger(GrammarPacker.class);
+
+  /**
+   * The packed grammar version number. Increment this any time you add new features, and update
+   * the documentation.
+   * 
+   * Version history:
+   * 
+   * - 3 (May 2016). This was the first version that was marked. It removed the special phrase-
+   * table packing that packed phrases without the [X,1] on the source and target sides, which
+   * then required special handling in the decoder to use for phrase-based decoding.
+   * 
+   * 
+   */
+  public static final int VERSION = 3;
+  
+  // Size limit for slice in bytes.
+  private static int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
+  // Estimated average number of feature entries for one rule.
+  private static int DATA_SIZE_ESTIMATE = 20;
+
+  private static final String SOURCE_WORDS_SEPARATOR = " ||| ";
+
+  // Output directory name.
+  private String output;
+
+  // Input grammar to be packed.
+  private String grammar;
+
+  public String getGrammar() {
+    return grammar;
+  }
+
+  public String getOutputDirectory() {
+    return output;
+  }
+
+  // Approximate maximum size of a slice in number of rules
+  private int approximateMaximumSliceSize;
+
+  private boolean labeled;
+
+  private boolean packAlignments;
+  private boolean grammarAlignments;
+  private String alignments;
+
+  private FeatureTypeAnalyzer types;
+  private EncoderConfiguration encoderConfig;
+
+  private String dump;
+
+  private int max_source_len;
+
+  public GrammarPacker(String grammar_filename, String config_filename, String output_filename,
+      String alignments_filename, String featuredump_filename, boolean grammar_alignments,
+      int approximateMaximumSliceSize)
+      throws IOException {
+    this.labeled = true;
+    this.grammar = grammar_filename;
+    this.output = output_filename;
+    this.dump = featuredump_filename;
+    this.grammarAlignments = grammar_alignments;
+    this.approximateMaximumSliceSize = approximateMaximumSliceSize;
+    this.max_source_len = 0;
+
+    // TODO: Always open encoder config? This is debatable.
+    this.types = new FeatureTypeAnalyzer(true);
+
+    this.alignments = alignments_filename;
+    packAlignments = grammarAlignments || (alignments != null);
+    if (!packAlignments) {
+      LOG.info("No alignments file or grammar specified, skipping.");
+    } else if (alignments != null && !new File(alignments_filename).exists()) {
+      throw new RuntimeException("Alignments file does not exist: " + alignments);
+    }
+
+    if (config_filename != null) {
+      readConfig(config_filename);
+      types.readConfig(config_filename);
+    } else {
+      LOG.info("No config specified. Attempting auto-detection of feature types.");
+    }
+    LOG.info("Approximate maximum slice size (in # of rules) set to {}", approximateMaximumSliceSize);
+
+    File working_dir = new File(output);
+    working_dir.mkdir();
+    if (!working_dir.exists()) {
+      throw new RuntimeException("Failed creating output directory.");
+    }
+  }
+
+  private void readConfig(String config_filename) throws IOException {
+    LineReader reader = new LineReader(config_filename);
+    while (reader.hasNext()) {
+      // Clean up line, chop comments off and skip if the result is empty.
+      String line = reader.next().trim();
+      if (line.indexOf('#') != -1)
+        line = line.substring(0, line.indexOf('#'));
+      if (line.isEmpty())
+        continue;
+      String[] fields = line.split("[\\s]+");
+
+      if (fields.length < 2) {
+        throw new RuntimeException("Incomplete line in config.");
+      }
+      if ("slice_size".equals(fields[0])) {
+        // Number of records to concurrently load into memory for sorting.
+        approximateMaximumSliceSize = Integer.parseInt(fields[1]);
+      }
+    }
+    reader.close();
+  }
+
+  /**
+   * Executes the packing.
+   * 
+   * @throws IOException if there is an error reading the grammar
+   */
+  public void pack() throws IOException {
+    LOG.info("Beginning exploration pass.");
+
+    // Explore pass. Learn vocabulary and feature value histograms.
+    LOG.info("Exploring: {}", grammar);
+
+    HieroFormatReader grammarReader = getGrammarReader();
+    explore(grammarReader);
+
+    LOG.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
+    if (dump != null) {
+      PrintWriter dump_writer = new PrintWriter(dump);
+      dump_writer.println(types.toString());
+      dump_writer.close();
+    }
+
+    types.inferTypes(this.labeled);
+    LOG.info("Type inference complete.");
+
+    LOG.info("Finalizing encoding.");
+
+    LOG.info("Writing encoding.");
+    types.write(output + File.separator + "encoding");
+
+    writeVocabulary();
+
+    String configFile = output + File.separator + "config";
+    LOG.info("Writing config to '{}'", configFile);
+    // Write config options
+    FileWriter config = new FileWriter(configFile);
+    config.write(String.format("version = %d\n", VERSION));
+    config.write(String.format("max-source-len = %d\n", max_source_len));
+    config.close();
+
+    // Read previously written encoder configuration to match up to changed
+    // vocabulary id's.
+    LOG.info("Reading encoding.");
+    encoderConfig = new EncoderConfiguration();
+    encoderConfig.load(output + File.separator + "encoding");
+
+    LOG.info("Beginning packing pass.");
+    // Actual binarization pass. Slice and pack source, target and data.
+    grammarReader = getGrammarReader();
+    LineReader alignment_reader = null;
+    if (packAlignments && !grammarAlignments)
+      alignment_reader = new LineReader(alignments);
+    binarize(grammarReader, alignment_reader);
+    LOG.info("Packing complete.");
+
+    LOG.info("Packed grammar in: {}", output);
+    LOG.info("Done.");
+  }
+
+  /**
+   * Returns a reader that turns whatever file format is found into Hiero grammar rules.
+   * 
+   * @param grammarFile
+   * @return
+   * @throws IOException
+   */
+  private HieroFormatReader getGrammarReader() throws IOException {
+    LineReader reader = new LineReader(grammar);
+    String line = reader.next();
+    if (line.startsWith("[")) {
+      return new HieroFormatReader(grammar);
+    } else {
+      return new MosesFormatReader(grammar);
+    }
+  }
+
+  /**
+   * This first pass over the grammar 
+   * @param reader
+   */
+  private void explore(HieroFormatReader reader) {
+
+    // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
+    // appear in the same order. They are assigned numeric names in order of appearance.
+    this.types.setLabeled(true);
+
+    for (Rule rule: reader) {
+
+      max_source_len = Math.max(max_source_len, rule.getFrench().length);
+
+      /* Add symbols to vocabulary.
+       * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
+       * and "[X,1]" to the vocabulary.
+       * 
+       * TODO: MJP May 2016: Is it necessary to add [X,1]? This is currently being done in
+       * {@link HieroFormatReader}, which is called by {@link MosesFormatReader}. 
+       */
+
+      // Add feature names to vocabulary and pass the value through the
+      // appropriate encoder.
+      int feature_counter = 0;
+      String[] features = rule.getFeatureString().split("\\s+");
+      for (int f = 0; f < features.length; ++f) {
+        if (features[f].contains("=")) {
+          String[] fe = features[f].split("=");
+          if (fe[0].equals("Alignment"))
+            continue;
+          types.observe(Vocabulary.id(fe[0]), Float.parseFloat(fe[1]));
+        } else {
+          types.observe(Vocabulary.id(String.valueOf(feature_counter++)),
+              Float.parseFloat(features[f]));
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns a String encoding the first two source words.
+   * If there is only one source word, use empty string for the second.
+   */
+  private String getFirstTwoSourceWords(final String[] source_words) {
+    return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
+  }
+
+  private void binarize(HieroFormatReader grammarReader, LineReader alignment_reader) throws IOException {
+    int counter = 0;
+    int slice_counter = 0;
+    int num_slices = 0;
+
+    boolean ready_to_flush = false;
+    // to determine when flushing is possible
+    String prev_first_two_source_words = null;
+
+    PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
+    PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
+    FeatureBuffer feature_buffer = new FeatureBuffer();
+
+    AlignmentBuffer alignment_buffer = null;
+    if (packAlignments)
+      alignment_buffer = new AlignmentBuffer();
+
+    TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
+    for (Rule rule: grammarReader) {
+      counter++;
+      slice_counter++;
+
+      String lhs_word = Vocabulary.word(rule.getLHS());
+      String[] source_words = rule.getFrenchWords().split("\\s+");
+      String[] target_words = rule.getEnglishWords().split("\\s+");
+      String[] feature_entries = rule.getFeatureString().split("\\s+");
+
+      // Reached slice limit size, indicate that we're closing up.
+      if (!ready_to_flush
+          && (slice_counter > approximateMaximumSliceSize
+              || feature_buffer.overflowing()
+              || (packAlignments && alignment_buffer.overflowing()))) {
+        ready_to_flush = true;
+        // store the first two source words when slice size limit was reached
+        prev_first_two_source_words = getFirstTwoSourceWords(source_words);
+      }
+      // ready to flush
+      if (ready_to_flush) {
+        final String first_two_source_words = getFirstTwoSourceWords(source_words);
+        // the grammar can only be partitioned at the level of first two source word changes.
+        // Thus, we can only flush if the current first two source words differ from the ones
+        // when the slice size limit was reached.
+        if (!first_two_source_words.equals(prev_first_two_source_words)) {
+          LOG.warn("ready to flush and first two words have changed ({} vs. {})",
+              prev_first_two_source_words, first_two_source_words);
+          LOG.info("flushing {} rules to slice.", slice_counter);
+          flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+          source_trie.clear();
+          target_trie.clear();
+          feature_buffer.clear();
+          if (packAlignments)
+            alignment_buffer.clear();
+
+          num_slices++;
+          slice_counter = 0;
+          ready_to_flush = false;
+        }
+      }
+
+      int alignment_index = -1;
+      // If present, process alignments.
+      if (packAlignments) {
+        String alignment_line;
+        if (grammarAlignments) {
+          alignment_line = rule.getAlignmentString();
+        } else {
+          if (!alignment_reader.hasNext()) {
+            LOG.error("No more alignments starting in line {}", counter);
+            throw new RuntimeException("No more alignments starting in line " + counter);
+          }
+          alignment_line = alignment_reader.next().trim();
+        }
+        String[] alignment_entries = alignment_line.split("\\s");
+        byte[] alignments = new byte[alignment_entries.length * 2];
+        if (alignment_line.length() > 0) {
+          for (int i = 0; i < alignment_entries.length; i++) {
+            String[] parts = alignment_entries[i].split("-");
+            alignments[2 * i] = Byte.parseByte(parts[0]);
+            alignments[2 * i + 1] = Byte.parseByte(parts[1]);
+          }
+        }
+        alignment_index = alignment_buffer.add(alignments);
+      }
+
+      // Process features.
+      // Implicitly sort via TreeMap, write to data buffer, remember position
+      // to pass on to the source trie node.
+      features.clear();
+      int feature_count = 0;
+      for (int f = 0; f < feature_entries.length; ++f) {
+        String feature_entry = feature_entries[f];
+        int feature_id;
+        float feature_value;
+        if (feature_entry.contains("=")) {
+          String[] parts = feature_entry.split("=");
+          if (parts[0].equals("Alignment"))
+            continue;
+          feature_id = Vocabulary.id(parts[0]);
+          feature_value = Float.parseFloat(parts[1]);
+        } else {
+          feature_id = Vocabulary.id(String.valueOf(feature_count++));
+          feature_value = Float.parseFloat(feature_entry);
+        }
+        if (feature_value != 0)
+          features.put(encoderConfig.innerId(feature_id), feature_value);
+      }
+      int features_index = feature_buffer.add(features);
+
+      // Sanity check on the data block index.
+      if (packAlignments && features_index != alignment_index) {
+        LOG.error("Block index mismatch between features ({}) and alignments ({}).",
+            features_index, alignment_index);
+        throw new RuntimeException("Data block index mismatch.");
+      }
+
+      // Process source side.
+      SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
+      int[] source = new int[source_words.length];
+      for (int i = 0; i < source_words.length; i++) {
+        if (FormatUtils.isNonterminal(source_words[i]))
+          source[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_words[i]));
+        else
+          source[i] = Vocabulary.id(source_words[i]);
+      }
+      source_trie.add(source, sv);
+
+      // Process target side.
+      TargetValue tv = new TargetValue(sv);
+      int[] target = new int[target_words.length];
+      for (int i = 0; i < target_words.length; i++) {
+        if (FormatUtils.isNonterminal(target_words[i])) {
+          target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
+        } else {
+          target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
+        }
+      }
+      target_trie.add(target, tv);
+    }
+    // flush last slice and clear buffers
+    flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+  }
+
+  /**
+   * Serializes the source, target and feature data structures into interlinked binary files. Target
+   * is written first, into a skeletal (node don't carry any data) upward-pointing trie, updating
+   * the linking source trie nodes with the position once it is known. Source and feature data are
+   * written simultaneously. The source structure is written into a downward-pointing trie and
+   * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
+   * prompted to write out a block
+   * 
+   * @param source_trie
+   * @param target_trie
+   * @param feature_buffer
+   * @param id
+   * @throws IOException
+   */
+  private void flush(PackingTrie<SourceValue> source_trie,
+      PackingTrie<TargetValue> target_trie, FeatureBuffer feature_buffer,
+      AlignmentBuffer alignment_buffer, int id) throws IOException {
+    // Make a slice object for this piece of the grammar.
+    PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
+    // Pull out the streams for source, target and data output.
+    DataOutputStream source_stream = slice.getSourceOutput();
+    DataOutputStream target_stream = slice.getTargetOutput();
+    DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
+    DataOutputStream feature_stream = slice.getFeatureOutput();
+    DataOutputStream alignment_stream = slice.getAlignmentOutput();
+
+    Queue<PackingTrie<TargetValue>> target_queue;
+    Queue<PackingTrie<SourceValue>> source_queue;
+
+    // The number of bytes both written into the source stream and
+    // buffered in the source queue.
+    int source_position;
+    // The number of bytes written into the target stream.
+    int target_position;
+
+    // Add trie root into queue, set target position to 0 and set cumulated
+    // size to size of trie root.
+    target_queue = new LinkedList<PackingTrie<TargetValue>>();
+    target_queue.add(target_trie);
+    target_position = 0;
+
+    // Target lookup table for trie levels.
+    int current_level_size = 1;
+    int next_level_size = 0;
+    ArrayList<Integer> target_lookup = new ArrayList<Integer>();
+
+    // Packing loop for upwards-pointing target trie.
+    while (!target_queue.isEmpty()) {
+      // Pop top of queue.
+      PackingTrie<TargetValue> node = target_queue.poll();
+      // Register that this is where we're writing the node to.
+      node.address = target_position;
+      // Tell source nodes that we're writing to this position in the file.
+      for (TargetValue tv : node.values)
+        tv.parent.target = node.address;
+      // Write link to parent.
+      if (node.parent != null)
+        target_stream.writeInt(node.parent.address);
+      else
+        target_stream.writeInt(-1);
+      target_stream.writeInt(node.symbol);
+      // Enqueue children.
+      for (int k : node.children.descendingKeySet()) {
+        PackingTrie<TargetValue> child = node.children.get(k);
+        target_queue.add(child);
+      }
+      target_position += node.size(false, true);
+      next_level_size += node.children.descendingKeySet().size();
+
+      current_level_size--;
+      if (current_level_size == 0) {
+        target_lookup.add(target_position);
+        current_level_size = next_level_size;
+        next_level_size = 0;
+      }
+    }
+    target_lookup_stream.writeInt(target_lookup.size());
+    for (int i : target_lookup)
+      target_lookup_stream.writeInt(i);
+    target_lookup_stream.close();
+
+    // Setting up for source and data writing.
+    source_queue = new LinkedList<PackingTrie<SourceValue>>();
+    source_queue.add(source_trie);
+    source_position = source_trie.size(true, false);
+    source_trie.address = target_position;
+
+    // Ready data buffers for writing.
+    feature_buffer.initialize();
+    if (packAlignments)
+      alignment_buffer.initialize();
+
+    // Packing loop for downwards-pointing source trie.
+    while (!source_queue.isEmpty()) {
+      // Pop top of queue.
+      PackingTrie<SourceValue> node = source_queue.poll();
+      // Write number of children.
+      source_stream.writeInt(node.children.size());
+      // Write links to children.
+      for (int k : node.children.descendingKeySet()) {
+        PackingTrie<SourceValue> child = node.children.get(k);
+        // Enqueue child.
+        source_queue.add(child);
+        // Child's address will be at the current end of the queue.
+        child.address = source_position;
+        // Advance cumulated size by child's size.
+        source_position += child.size(true, false);
+        // Write the link.
+        source_stream.writeInt(k);
+        source_stream.writeInt(child.address);
+      }
+      // Write number of data items.
+      source_stream.writeInt(node.values.size());
+      // Write lhs and links to target and data.
+      for (SourceValue sv : node.values) {
+        int feature_block_index = feature_buffer.write(sv.data);
+        if (packAlignments) {
+          int alignment_block_index = alignment_buffer.write(sv.data);
+          if (alignment_block_index != feature_block_index) {
+            LOG.error("Block index mismatch.");
+            throw new RuntimeException("Block index mismatch: alignment (" + alignment_block_index
+                + ") and features (" + feature_block_index + ") don't match.");
+          }
+        }
+        source_stream.writeInt(sv.lhs);
+        source_stream.writeInt(sv.target);
+        source_stream.writeInt(feature_block_index);
+      }
+    }
+    // Flush the data stream.
+    feature_buffer.flush(feature_stream);
+    if (packAlignments)
+      alignment_buffer.flush(alignment_stream);
+
+    target_stream.close();
+    source_stream.close();
+    feature_stream.close();
+    if (packAlignments)
+      alignment_stream.close();
+  }
+
+  public void writeVocabulary() throws IOException {
+    final String vocabularyFilename = output + File.separator + VOCABULARY_FILENAME;
+    LOG.info("Writing vocabulary to {}", vocabularyFilename);
+    Vocabulary.write(vocabularyFilename);
+  }
+
+  /**
+   * Integer-labeled, doubly-linked trie with some provisions for packing.
+   * 
+   * @author Juri Ganitkevitch
+   * 
+   * @param <D> The trie's value type.
+   */
+  class PackingTrie<D extends PackingTrieValue> {
+    int symbol;
+    PackingTrie<D> parent;
+
+    TreeMap<Integer, PackingTrie<D>> children;
+    List<D> values;
+
+    int address;
+
+    PackingTrie() {
+      address = -1;
+
+      symbol = 0;
+      parent = null;
+
+      children = new TreeMap<Integer, PackingTrie<D>>();
+      values = new ArrayList<D>();
+    }
+
+    PackingTrie(PackingTrie<D> parent, int symbol) {
+      this();
+      this.parent = parent;
+      this.symbol = symbol;
+    }
+
+    void add(int[] path, D value) {
+      add(path, 0, value);
+    }
+
+    private void add(int[] path, int index, D value) {
+      if (index == path.length)
+        this.values.add(value);
+      else {
+        PackingTrie<D> child = children.get(path[index]);
+        if (child == null) {
+          child = new PackingTrie<D>(this, path[index]);
+          children.put(path[index], child);
+        }
+        child.add(path, index + 1, value);
+      }
+    }
+
+    /**
+     * Calculate the size (in ints) of a packed trie node. Distinguishes downwards pointing (parent
+     * points to children) from upwards pointing (children point to parent) tries, as well as
+     * skeletal (no data, just the labeled links) and non-skeletal (nodes have a data block)
+     * packing.
+     * 
+     * @param downwards Are we packing into a downwards-pointing trie?
+     * @param skeletal Are we packing into a skeletal trie?
+     * 
+     * @return Number of bytes the trie node would occupy.
+     */
+    int size(boolean downwards, boolean skeletal) {
+      int size = 0;
+      if (downwards) {
+        // Number of children and links to children.
+        size = 1 + 2 * children.size();
+      } else {
+        // Link to parent.
+        size += 2;
+      }
+      // Non-skeletal packing: number of data items.
+      if (!skeletal)
+        size += 1;
+      // Non-skeletal packing: write size taken up by data items.
+      if (!skeletal && !values.isEmpty())
+        size += values.size() * values.get(0).size();
+
+      return size;
+    }
+
+    void clear() {
+      children.clear();
+      values.clear();
+    }
+  }
+
+  interface PackingTrieValue {
+    int size();
+  }
+
+  class SourceValue implements PackingTrieValue {
+    int lhs;
+    int data;
+    int target;
+
+    public SourceValue() {
+    }
+
+    SourceValue(int lhs, int data) {
+      this.lhs = lhs;
+      this.data = data;
+    }
+
+    void setTarget(int target) {
+      this.target = target;
+    }
+
+    public int size() {
+      return 3;
+    }
+  }
+
+  class TargetValue implements PackingTrieValue {
+    SourceValue parent;
+
+    TargetValue(SourceValue parent) {
+      this.parent = parent;
+    }
+
+    public int size() {
+      return 0;
+    }
+  }
+
+  abstract class PackingBuffer<T> {
+    private byte[] backing;
+    protected ByteBuffer buffer;
+
+    protected ArrayList<Integer> memoryLookup;
+    protected int totalSize;
+    protected ArrayList<Integer> onDiskOrder;
+
+    PackingBuffer() throws IOException {
+      allocate();
+      memoryLookup = new ArrayList<Integer>();
+      onDiskOrder = new ArrayList<Integer>();
+      totalSize = 0;
+    }
+
+    abstract int add(T item);
+
+    // Allocate a reasonably-sized buffer for the feature data.
+    private void allocate() {
+      backing = new byte[approximateMaximumSliceSize * DATA_SIZE_ESTIMATE];
+      buffer = ByteBuffer.wrap(backing);
+    }
+
+    // Reallocate the backing array and buffer, copies data over.
+    protected void reallocate() {
+      if (backing.length == Integer.MAX_VALUE)
+        return;
+      long attempted_length = backing.length * 2l;
+      int new_length;
+      // Detect overflow.
+      if (attempted_length >= Integer.MAX_VALUE)
+        new_length = Integer.MAX_VALUE;
+      else
+        new_length = (int) attempted_length;
+      byte[] new_backing = new byte[new_length];
+      System.arraycopy(backing, 0, new_backing, 0, backing.length);
+      int old_position = buffer.position();
+      ByteBuffer new_buffer = ByteBuffer.wrap(new_backing);
+      new_buffer.position(old_position);
+      buffer = new_buffer;
+      backing = new_backing;
+    }
+
+    /**
+     * Prepare the data buffer for disk writing.
+     */
+    void initialize() {
+      onDiskOrder.clear();
+    }
+
+    /**
+     * Enqueue a data block for later writing.
+     * 
+     * @param block_index The index of the data block to add to writing queue.
+     * @return The to-be-written block's output index.
+     */
+    int write(int block_index) {
+      onDiskOrder.add(block_index);
+      return onDiskOrder.size() - 1;
+    }
+
+    /**
+     * Performs the actual writing to disk in the order specified by calls to write() since the last
+     * call to initialize().
+     * 
+     * @param out
+     * @throws IOException
+     */
+    void flush(DataOutputStream out) throws IOException {
+      writeHeader(out);
+      int size;
+      int block_address;
+      for (int block_index : onDiskOrder) {
+        block_address = memoryLookup.get(block_index);
+        size = blockSize(block_index);
+        out.write(backing, block_address, size);
+      }
+    }
+
+    void clear() {
+      buffer.clear();
+      memoryLookup.clear();
+      onDiskOrder.clear();
+    }
+
+    boolean overflowing() {
+      return (buffer.position() >= DATA_SIZE_LIMIT);
+    }
+
+    private void writeHeader(DataOutputStream out) throws IOException {
+      if (out.size() == 0) {
+        out.writeInt(onDiskOrder.size());
+        out.writeInt(totalSize);
+        int disk_position = headerSize();
+        for (int block_index : onDiskOrder) {
+          out.writeInt(disk_position);
+          disk_position += blockSize(block_index);
+        }
+      } else {
+        throw new RuntimeException("Got a used stream for header writing.");
+      }
+    }
+
+    private int headerSize() {
+      // One integer for each data block, plus number of blocks and total size.
+      return 4 * (onDiskOrder.size() + 2);
+    }
+
+    private int blockSize(int block_index) {
+      int block_address = memoryLookup.get(block_index);
+      return (block_index < memoryLookup.size() - 1 ? memoryLookup.get(block_index + 1) : totalSize)
+          - block_address;
+    }
+  }
+
+  class FeatureBuffer extends PackingBuffer<TreeMap<Integer, Float>> {
+
+    private IntEncoder idEncoder;
+
+    FeatureBuffer() throws IOException {
+      super();
+      idEncoder = types.getIdEncoder();
+      LOG.info("Encoding feature ids in: {}", idEncoder.getKey());
+    }
+
+    /**
+     * Add a block of features to the buffer.
+     * 
+     * @param features TreeMap with the features for one rule.
+     * @return The index of the resulting data block.
+     */
+    int add(TreeMap<Integer, Float> features) {
+      int data_position = buffer.position();
+
+      // Over-estimate how much room this addition will need: for each
+      // feature (ID_SIZE for label, "upper bound" of 4 for the value), plus ID_SIZE for
+      // the number of features. If this won't fit, reallocate the buffer.
+      int size_estimate = (4 + EncoderConfiguration.ID_SIZE) * features.size()
+          + EncoderConfiguration.ID_SIZE;
+      if (buffer.capacity() - buffer.position() <= size_estimate)
+        reallocate();
+
+      // Write features to buffer.
+      idEncoder.write(buffer, features.size());
+      for (Integer k : features.descendingKeySet()) {
+        float v = features.get(k);
+        // Sparse features.
+        if (v != 0.0) {
+          idEncoder.write(buffer, k);
+          encoderConfig.encoder(k).write(buffer, v);
+        }
+      }
+      // Store position the block was written to.
+      memoryLookup.add(data_position);
+      // Update total size (in bytes).
+      totalSize = buffer.position();
+
+      // Return block index.
+      return memoryLookup.size() - 1;
+    }
+  }
+
+  class AlignmentBuffer extends PackingBuffer<byte[]> {
+
+    AlignmentBuffer() throws IOException {
+      super();
+    }
+
+    /**
+     * Add a rule alignments to the buffer.
+     * 
+     * @param alignments a byte array with the alignment points for one rule.
+     * @return The index of the resulting data block.
+     */
+    int add(byte[] alignments) {
+      int data_position = buffer.position();
+      int size_estimate = alignments.length + 1;
+      if (buffer.capacity() - buffer.position() <= size_estimate)
+        reallocate();
+
+      // Write alignment points to buffer.
+      buffer.put((byte) (alignments.length / 2));
+      buffer.put(alignments);
+
+      // Store position the block was written to.
+      memoryLookup.add(data_position);
+      // Update total size (in bytes).
+      totalSize = buffer.position();
+      // Return block index.
+      return memoryLookup.size() - 1;
+    }
+  }
+
+  class PackingFileTuple implements Comparable<PackingFileTuple> {
+    private File sourceFile;
+    private File targetLookupFile;
+    private File targetFile;
+
+    private File featureFile;
+    private File alignmentFile;
+
+    PackingFileTuple(String prefix) {
+      sourceFile = new File(output + File.separator + prefix + ".source");
+      targetFile = new File(output + File.separator + prefix + ".target");
+      targetLookupFile = new File(output + File.separator + prefix + ".target.lookup");
+      featureFile = new File(output + File.separator + prefix + ".features");
+
+      alignmentFile = null;
+      if (packAlignments)
+        alignmentFile = new File(output + File.separator + prefix + ".alignments");
+
+      LOG.info("Allocated slice: {}", sourceFile.getAbsolutePath());
+    }
+
+    DataOutputStream getSourceOutput() throws IOException {
+      return getOutput(sourceFile);
+    }
+
+    DataOutputStream getTargetOutput() throws IOException {
+      return getOutput(targetFile);
+    }
+
+    DataOutputStream getTargetLookupOutput() throws IOException {
+      return getOutput(targetLookupFile);
+    }
+
+    DataOutputStream getFeatureOutput() throws IOException {
+      return getOutput(featureFile);
+    }
+
+    DataOutputStream getAlignmentOutput() throws IOException {
+      if (alignmentFile != null)
+        return getOutput(alignmentFile);
+      return null;
+    }
+
+    private DataOutputStream getOutput(File file) throws IOException {
+      if (file.createNewFile()) {
+        return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
+      } else {
+        throw new RuntimeException("File doesn't exist: " + file.getName());
+      }
+    }
+
+    long getSize() {
+      return sourceFile.length() + targetFile.length() + featureFile.length();
+    }
+
+    @Override
+    public int compareTo(PackingFileTuple o) {
+      if (getSize() > o.getSize()) {
+        return -1;
+      } else if (getSize() < o.getSize()) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java
new file mode 100644
index 0000000..3cd4d0c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.kohsuke.args4j.spi.StringArrayOptionHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class GrammarPackerCli {
+  
+  private static final Logger LOG = LoggerFactory.getLogger(GrammarPackerCli.class);
+  
+  // Input grammars to be packed (with a joint vocabulary)
+  @Option(name = "--grammars", aliases = {"-g", "-i"}, handler = StringArrayOptionHandler.class, required = true, usage = "list of grammars to pack (jointly, i.e. they share the same vocabulary)")
+  private List<String> grammars = new ArrayList<>();
+  
+  // Output grammars
+  @Option(name = "--outputs", aliases = {"-p", "-o"}, handler = StringArrayOptionHandler.class, required = true, usage = "output directories of packed grammars.")
+  private List<String> outputs = new ArrayList<>();
+  
+  // Output grammars
+  @Option(name = "--alignments", aliases = {"-a", "--fa"}, handler = StringArrayOptionHandler.class, required = false, usage = "alignment files")
+  private List<String> alignments_filenames = new ArrayList<>();
+  
+  // Config filename
+  @Option(name = "--config_file", aliases = {"-c"}, required = false, usage = "(optional) packing configuration file")
+  private String config_filename;
+  
+  @Option(name = "--dump_files", aliases = {"-d"}, handler = StringArrayOptionHandler.class, usage = "(optional) dump feature stats to file")
+  private List<String> featuredump_filenames = new ArrayList<>();
+  
+  @Option(name = "--ga", usage = "whether alignments are present in the grammar")
+  private boolean grammar_alignments = false;
+  
+  @Option(name = "--slice_size", aliases = {"-s"}, required = false, usage = "approximate slice size in # of rules (default=1000000)")
+  private int slice_size = 1000000;
+  
+  
+  private void run() throws IOException {
+
+    final List<String> missingFilenames = new ArrayList<>(grammars.size());
+    for (final String g : grammars) {
+      if (!new File(g).exists()) {
+        missingFilenames.add(g);
+      }
+    }
+    if (!missingFilenames.isEmpty()) {
+      throw new IOException("Input grammar files not found: " + missingFilenames.toString());
+    }
+    
+    if (config_filename != null && !new File(config_filename).exists()) {
+      throw new IOException("Config file not found: " + config_filename);
+    }
+
+    if (!outputs.isEmpty()) {
+      if (outputs.size() != grammars.size()) {
+        throw new IOException("Must provide an output directory for each grammar");
+      }
+      final List<String> existingOutputs = new ArrayList<>(outputs.size());
+      for (final String o : outputs) {
+        if (new File(o).exists()) {
+          existingOutputs.add(o);
+        }
+      }
+      if (!existingOutputs.isEmpty()) {
+        throw new IOException("These output directories already exist (will not overwrite): " + existingOutputs.toString());
+      }
+    }
+    if (outputs.isEmpty()) {
+      for (final String g : grammars) {
+        outputs.add(g + ".packed");
+      }
+    }
+    
+    if (!alignments_filenames.isEmpty()) {
+      final List<String> missingAlignmentFiles = new ArrayList<>(alignments_filenames.size());
+      for (final String a : alignments_filenames) {
+        if (!new File(a).exists()) {
+          missingAlignmentFiles.add(a);
+        }
+      }
+      if (!missingAlignmentFiles.isEmpty()) {
+        throw new IOException("Alignment files not found: " + missingAlignmentFiles.toString());
+      }
+    }
+
+    // create Packer instances for each grammar
+    final List<GrammarPacker> packers = new ArrayList<>(grammars.size());
+    for (int i = 0; i < grammars.size(); i++) {
+      LOG.info("Starting GrammarPacker for {}",  grammars.get(i));
+      final String alignment_filename = alignments_filenames.isEmpty() ? null : alignments_filenames.get(i);
+      final String featuredump_filename = featuredump_filenames.isEmpty() ? null : featuredump_filenames.get(i);
+      final GrammarPacker packer = new GrammarPacker(
+          grammars.get(i),
+          config_filename,
+          outputs.get(i),
+          alignment_filename,
+          featuredump_filename,
+          grammar_alignments,
+          slice_size);
+      packers.add(packer);
+    }
+    
+    // run all packers in sequence, accumulating vocabulary items
+    for (final GrammarPacker packer : packers) {
+      LOG.info("Starting GrammarPacker for {}", packer.getGrammar());
+      packer.pack();
+      LOG.info("PackedGrammar located at {}", packer.getOutputDirectory());
+    }
+    
+    // for each packed grammar, overwrite the internally serialized vocabulary with the current global one.
+    for (final GrammarPacker packer : packers) {
+      LOG.info("Writing final common Vocabulary to {}",  packer.getOutputDirectory());
+      packer.writeVocabulary();
+    }
+  }
+
+  public static void main(String[] args) throws IOException {
+    final GrammarPackerCli cli = new GrammarPackerCli();
+    final CmdLineParser parser = new CmdLineParser(cli);
+
+    try {
+      parser.parseArgument(args);
+      cli.run();
+    } catch (CmdLineException e) {
+      LOG.error(e.getMessage(), e);
+      parser.printUsage(System.err);
+      System.exit(1);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java b/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java
new file mode 100644
index 0000000..2fd2b3f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.corpus.syntax.ArraySyntaxTree;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Finds labeling for a set of phrases.
+ * 
+ * @author Juri Ganitkevitch
+ */
+public class LabelPhrases {
+
+  private static final Logger LOG = LoggerFactory.getLogger(LabelPhrases.class);
+
+  /**
+   * Main method.
+   * 
+   * @param args names of the two grammars to be compared
+   * @throws IOException if there is an error reading the input grammars
+   */
+  public static void main(String[] args) throws IOException {
+
+    if (args.length < 1 || args[0].equals("-h")) {
+      System.err.println("Usage: " + LabelPhrases.class.toString());
+      System.err.println("    -p phrase_file     phrase-sentence file to process");
+      System.err.println();
+      System.exit(-1);
+    }
+
+    String phrase_file_name = null;
+
+    for (int i = 0; i < args.length; i++) {
+      if ("-p".equals(args[i])) phrase_file_name = args[++i];
+    }
+    if (phrase_file_name == null) {
+      LOG.error("a phrase file is required for operation");
+      System.exit(-1);
+    }
+
+    LineReader phrase_reader = new LineReader(phrase_file_name);
+
+    while (phrase_reader.ready()) {
+      String line = phrase_reader.readLine();
+
+      String[] fields = line.split("\\t");
+      if (fields.length != 3 || fields[2].equals("()")) {
+        System.err.println("[FAIL] Empty parse in line:\t" + line);
+        continue;
+      }
+
+      String[] phrase_strings = fields[0].split("\\s");
+      int[] phrase_ids = new int[phrase_strings.length];
+      for (int i = 0; i < phrase_strings.length; i++)
+        phrase_ids[i] = Vocabulary.id(phrase_strings[i]);
+
+      ArraySyntaxTree syntax = new ArraySyntaxTree(fields[2]);
+      int[] sentence_ids = syntax.getTerminals();
+
+      int match_start = -1;
+      int match_end = -1;
+      for (int i = 0; i < sentence_ids.length; i++) {
+        if (phrase_ids[0] == sentence_ids[i]) {
+          match_start = i;
+          int j = 0;
+          while (j < phrase_ids.length && phrase_ids[j] == sentence_ids[i + j]) {
+            j++;
+          }
+          if (j == phrase_ids.length) {
+            match_end = i + j;
+            break;
+          }
+        }
+      }
+
+      int label = syntax.getOneConstituent(match_start, match_end);
+      if (label == 0) label = syntax.getOneSingleConcatenation(match_start, match_end);
+      if (label == 0) label = syntax.getOneRightSideCCG(match_start, match_end);
+      if (label == 0) label = syntax.getOneLeftSideCCG(match_start, match_end);
+      if (label == 0) label = syntax.getOneDoubleConcatenation(match_start, match_end);
+      if (label == 0) {
+        System.err.println("[FAIL] No label found in line:\t" + line);
+        continue;
+      }
+
+      System.out.println(Vocabulary.word(label) + "\t" + line);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java b/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java
new file mode 100644
index 0000000..ecb2e6e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java
@@ -0,0 +1,383 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestSetFilter {
+
+  private static final Logger LOG = LoggerFactory.getLogger(TestSetFilter.class);
+
+  private Filter filter = null;
+
+  // for caching of accepted rules
+  private String lastSourceSide;
+  private boolean acceptedLastSourceSide;
+
+  public int cached = 0;
+  public int RULE_LENGTH = 12;
+  public boolean verbose = false;
+  public boolean parallel = false;
+
+  private static final String DELIMITER = "|||";
+  private static final String DELIMITER_REGEX = " \\|\\|\\| ";
+  public static final String DELIM = String.format(" %s ", DELIMITER);
+  public static final Pattern P_DELIM = Pattern.compile(DELIMITER_REGEX);
+  private final String NT_REGEX = "\\[[^\\]]+?\\]";
+
+  public TestSetFilter() {
+    acceptedLastSourceSide = false;
+    lastSourceSide = null;
+  }
+  
+  public String getFilterName() {
+    if (filter != null)
+      if (filter instanceof FastFilter)
+        return "fast";
+      else if (filter instanceof LooseFilter)
+        return "loose";
+      else
+        return "exact";
+    return "null";
+  }
+
+  public void setVerbose(boolean value) {
+    verbose = value;
+  }
+
+  public void setParallel(boolean value) {
+    parallel = value;
+  }
+
+  public void setFilter(String type) {
+    if (type.equals("fast"))
+      filter = new FastFilter();
+    else if (type.equals("exact"))
+      filter = new ExactFilter();
+    else if (type.equals("loose"))
+      filter = new LooseFilter();
+    else
+      throw new RuntimeException(String.format("Invalid filter type '%s'", type));
+  }
+
+  public void setRuleLength(int value) {
+    RULE_LENGTH = value;
+  }
+
+  private void loadTestSentences(String filename) throws IOException {
+    int count = 0;
+
+    try {
+      for (String line: new LineReader(filename)) {
+        filter.addSentence(line);
+        count++;
+      }
+    } catch (FileNotFoundException e) {
+      LOG.error(e.getMessage(), e);
+    }
+
+    if (verbose)
+      System.err.println(String.format("Added %d sentences.\n", count));
+  }
+
+  /**
+   * Top-level filter, responsible for calling the fast or exact version. Takes the source side 
+   * of a rule and determines whether there is any sentence in the test set that can match it.
+   * @param sourceSide an input source sentence
+   * @return true if is any sentence in the test set can match the source input
+   */
+  public boolean inTestSet(String sourceSide) {
+    if (!sourceSide.equals(lastSourceSide)) {
+      lastSourceSide = sourceSide;
+      acceptedLastSourceSide = filter.permits(sourceSide);
+    } else {
+      cached++;
+    }
+
+    return acceptedLastSourceSide;
+  }
+    
+  /**
+   * Determines whether a rule is an abstract rule. An abstract rule is one that has no terminals on
+   * its source side.
+   * 
+   * If the rule is abstract, the rule's arity is returned. Otherwise, 0 is returned.
+   */
+  private boolean isAbstract(String source) {
+    int nonterminalCount = 0;
+    for (String t : source.split("\\s+")) {
+      if (!t.matches(NT_REGEX))
+        return false;
+      nonterminalCount++;
+    }
+    return nonterminalCount != 0;
+  }
+
+  private interface Filter {
+    /* Tell the filter about a sentence in the test set being filtered to */
+    public void addSentence(String sentence);
+    
+    /* Returns true if the filter permits the specified source side */
+    public boolean permits(String sourceSide);
+  }
+
+  private class FastFilter implements Filter {
+    private Set<String> ngrams = null;
+
+    public FastFilter() {
+      ngrams = new HashSet<String>();
+    }
+    
+    @Override
+    public boolean permits(String source) {
+      for (String chunk : source.split(NT_REGEX)) {
+        chunk = chunk.trim();
+        /* Important: you need to make sure the string isn't empty. */
+        if (!chunk.equals("") && !ngrams.contains(chunk))
+          return false;
+      }
+      return true;
+    }
+
+    @Override
+    public void addSentence(String sentence) {
+      String[] tokens = sentence.trim().split("\\s+");
+      int maxOrder = RULE_LENGTH < tokens.length ? RULE_LENGTH : tokens.length;
+      for (int order = 1; order <= maxOrder; order++) {
+        for (int start = 0; start < tokens.length - order + 1; start++)
+          ngrams.add(createNGram(tokens, start, order));
+      }
+    }
+
+    private String createNGram(String[] tokens, int start, int order) {
+      if (order < 1 || start + order > tokens.length) {
+        return "";
+      }
+      String result = tokens[start];
+      for (int i = 1; i < order; i++)
+        result += " " + tokens[start + i];
+      return result;
+    }
+  }
+
+  private class LooseFilter implements Filter {
+    List<String> testSentences = null;
+
+    public LooseFilter() {
+      testSentences = new ArrayList<String>();
+    }
+    
+    @Override
+    public void addSentence(String source) {
+      testSentences.add(source);
+    }
+
+    @Override
+    public boolean permits(String source) {
+      Pattern pattern = getPattern(source);
+      for (String testSentence : testSentences) {
+        if (pattern.matcher(testSentence).find()) {
+          return true;
+        }
+      }
+      return isAbstract(source);
+    }
+
+    protected Pattern getPattern(String source) {
+      String pattern = source;
+      pattern = pattern.replaceAll(String.format("\\s*%s\\s*", NT_REGEX), ".+");
+      pattern = pattern.replaceAll("\\s+", ".*");
+//      System.err.println(String.format("PATTERN(%s) = %s", source, pattern));
+      return Pattern.compile(pattern);
+    }
+  }
+
+  /**
+   * This class is the same as LooseFilter except with a tighter regex for matching rules.
+   */
+  private class ExactFilter implements Filter {
+    private FastFilter fastFilter = null;
+    private Map<String, Set<Integer>> sentencesByWord;
+    List<String> testSentences = null;
+    
+    public ExactFilter() {
+      fastFilter = new FastFilter();
+      sentencesByWord = new HashMap<String, Set<Integer>>();
+      testSentences = new ArrayList<String>();
+    }
+    
+    @Override
+    public void addSentence(String source) {
+      fastFilter.addSentence(source);
+      addSentenceToWordHash(source, testSentences.size());
+      testSentences.add(source);
+    }
+
+    /**
+     * Always permit abstract rules. Otherwise, query the fast filter, and if that passes, apply
+     * 
+     */
+    @Override
+    public boolean permits(String sourceSide) {
+      if (isAbstract(sourceSide))
+        return true;
+      
+      if (fastFilter.permits(sourceSide)) {
+        Pattern pattern = getPattern(sourceSide);
+        for (int i : getSentencesForRule(sourceSide)) {
+          if (pattern.matcher(testSentences.get(i)).find()) {
+            return true;
+          }
+        }
+      } 
+      return false;
+    }
+    
+    protected Pattern getPattern(String source) {
+      String pattern = Pattern.quote(source);
+      pattern = pattern.replaceAll(NT_REGEX, "\\\\E.+\\\\Q");
+      pattern = pattern.replaceAll("\\\\Q\\\\E", "");
+      pattern = "(?:^|\\s)" + pattern + "(?:$|\\s)";
+      return Pattern.compile(pattern);
+    }
+  
+    /*
+     * Map words to all the sentences they appear in.
+     */
+    private void addSentenceToWordHash(String sentence, int index) {
+      String[] tokens = sentence.split("\\s+");
+      for (String t : tokens) {
+        if (! sentencesByWord.containsKey(t))
+          sentencesByWord.put(t, new HashSet<Integer>());
+        sentencesByWord.get(t).add(index);
+      }
+    }
+    
+    private Set<Integer> getSentencesForRule(String source) {
+      Set<Integer> sentences = null;
+      for (String token : source.split("\\s+")) {
+        if (!token.matches(NT_REGEX)) {
+          if (sentencesByWord.containsKey(token)) {
+            if (sentences == null)
+              sentences = new HashSet<Integer>(sentencesByWord.get(token));
+            else
+              sentences.retainAll(sentencesByWord.get(token));
+          }
+        }
+      }
+      
+      return sentences;
+    }
+  }
+
+  public static void main(String[] argv) throws IOException {
+    // do some setup
+    if (argv.length < 1) {
+      System.err.println("usage: TestSetFilter [-v|-p|-f|-e|-l|-n N|-g grammar] test_set1 [test_set2 ...]");
+      System.err.println("    -g    grammar file (can also be on STDIN)");
+      System.err.println("    -v    verbose output");
+      System.err.println("    -p    parallel compatibility");
+      System.err.println("    -f    fast mode (default)");
+      System.err.println("    -e    exact mode (slower)");
+      System.err.println("    -l    loose mode");
+      System.err.println("    -n    max n-gram to compare to (default 12)");
+      return;
+    }
+    
+    String grammarFile = null;
+
+    TestSetFilter filter = new TestSetFilter();
+
+    for (int i = 0; i < argv.length; i++) {
+      if (argv[i].equals("-v")) {
+        filter.setVerbose(true);
+        continue;
+      } else if (argv[i].equals("-p")) {
+        filter.setParallel(true);
+        continue;
+      } else if (argv[i].equals("-g")) {
+        grammarFile = argv[++i];
+        continue;
+      } else if (argv[i].equals("-f")) {
+        filter.setFilter("fast");
+        continue;
+      } else if (argv[i].equals("-e")) {
+        filter.setFilter("exact");
+        continue;
+      } else if (argv[i].equals("-l")) {
+        filter.setFilter("loose");
+        continue;
+      } else if (argv[i].equals("-n")) {
+        filter.setRuleLength(Integer.parseInt(argv[i + 1]));
+        i++;
+        continue;
+      }
+
+      filter.loadTestSentences(argv[i]);
+    }
+
+    int rulesIn = 0;
+    int rulesOut = 0;
+    if (filter.verbose) {
+      System.err.println(String.format("Filtering rules with the %s filter...", filter.getFilterName()));
+//      System.err.println("Using at max " + filter.RULE_LENGTH + " n-grams...");
+    }
+    LineReader reader = (grammarFile != null) 
+        ? new LineReader(grammarFile, filter.verbose)
+        : new LineReader(System.in); 
+    for (String rule: reader) {
+      rulesIn++;
+
+      String[] parts = P_DELIM.split(rule);
+      if (parts.length >= 4) {
+        // the source is the second field for thrax grammars, first field for phrasal ones 
+        String source = rule.startsWith("[") ? parts[1].trim() : parts[0].trim();
+        if (filter.inTestSet(source)) {
+          System.out.println(rule);
+          if (filter.parallel)
+            System.out.flush();
+          rulesOut++;
+        } else if (filter.parallel) {
+          System.out.println("");
+          System.out.flush();
+        }
+      }
+    }
+    if (filter.verbose) {
+      System.err.println("[INFO] Total rules read: " + rulesIn);
+      System.err.println("[INFO] Rules kept: " + rulesOut);
+      System.err.println("[INFO] Rules dropped: " + (rulesIn - rulesOut));
+      System.err.println("[INFO] cached queries: " + filter.cached);
+    }
+
+    return;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java b/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java
new file mode 100644
index 0000000..4c536ce
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui;
+
+public enum Orientation {
+  HORIZONTAL, VERTICAL
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java b/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java
new file mode 100644
index 0000000..cccdd80
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui;
+
+import java.awt.BorderLayout;
+import java.awt.Color;
+import java.awt.Font;
+import java.awt.GraphicsEnvironment;
+import java.awt.Image;
+import java.awt.Point;
+
+import javax.swing.BorderFactory;
+import javax.swing.ImageIcon;
+import javax.swing.JLabel;
+import javax.swing.JPanel;
+import javax.swing.JWindow;
+
+/**
+ * Startup window for Joshua programs.
+ * 
+ * @author Lane Schwartz
+ * @author Aaron Phillips
+ */
+public class StartupWindow extends JWindow {
+
+  /** Serialization identifier. */
+  private static final long serialVersionUID = 1L;
+
+  /**
+   * Constructs a splash screen.
+   * 
+   * @param title Title to be displayed
+   */
+  public StartupWindow(String title) {
+    this(title, "Joshua Developers", "2010", Color.BLACK, 5);
+  }
+
+  public StartupWindow(String title, String author, String year, Image image, Color borderColor,
+      int borderWidth) {
+    JPanel content = (JPanel) getContentPane();
+    content.setBackground(Color.WHITE);
+
+    int width = 250;
+    int height = 100;
+
+    Point center = GraphicsEnvironment.getLocalGraphicsEnvironment().getCenterPoint();
+    setBounds(center.x - width / 2, center.y - height / 2, width, height);
+
+    JLabel titleLabel = new JLabel(title, JLabel.CENTER);
+    titleLabel.setFont(new Font("Sans-Serif", Font.BOLD, 24));
+    content.add(titleLabel, BorderLayout.NORTH);
+
+    JLabel copyright = new JLabel("\u24D2 " + year + " - " + author, JLabel.CENTER);
+    copyright.setFont(new Font("Sans-Serif", Font.PLAIN, 8));
+    content.add(copyright, BorderLayout.SOUTH);
+
+    if (image != null) {
+      content.add(new JLabel(new ImageIcon(image)));
+    }
+
+    content.setBorder(BorderFactory.createLineBorder(borderColor, borderWidth));
+
+    // Display it
+    setVisible(true);
+  }
+
+  public StartupWindow(String title, String author, String year, Color borderColor, int borderWidth) {
+    this(title, author, year, null, borderColor, borderWidth);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java b/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java
new file mode 100644
index 0000000..1d69516
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides classes for visualizing parts of the translation process.
+ */
+package org.apache.joshua.ui;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java
new file mode 100644
index 0000000..f09a40a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Collections;
+
+import org.apache.joshua.ui.tree_visualizer.tree.Tree;
+
+import edu.uci.ics.jung.graph.DirectedOrderedSparseMultigraph;
+import edu.uci.ics.jung.graph.util.EdgeType;
+import edu.uci.ics.jung.graph.util.Pair;
+
+public class DerivationTree extends DirectedOrderedSparseMultigraph<Node, DerivationTreeEdge> {
+  /**
+   * Eclipse thinks this is necessary.
+   */
+  private static final long serialVersionUID = 2914449263979566324L;
+
+  public final Node root;
+  public final Node sourceRoot;
+
+  public DerivationTree(Tree t, String source) {
+    final Tree.Node treeRoot = t.root();
+    final String rootLabel = treeRoot.label();
+    root = new Node(rootLabel, false);
+    sourceRoot = new Node(rootLabel, true);
+    addVertex(root);
+    addVertex(sourceRoot);
+    addSubtreeRootedAt(root, treeRoot);
+    final String[] sourceWords = source.split("\\s+");
+    addSourceSubtreeRootedAt(sourceRoot, treeRoot, 0, sourceWords.length, sourceWords);
+  }
+
+  private void addSubtreeRootedAt(Node n, Tree.Node tn) {
+    for (Tree.Node child : tn.children()) {
+      Node childNode = new Node(child.label(), false);
+      addVertex(childNode);
+      addEdge(new DerivationTreeEdge(false), new Pair<Node>(n, childNode), EdgeType.DIRECTED);
+      addSubtreeRootedAt(childNode, child);
+    }
+  }
+
+  private void addSourceSubtreeRootedAt(Node n, Tree.Node tn, int firstIndex, int lastIndex,
+      String[] sourceWords) {
+    int nextUncoveredIndex = firstIndex;
+    Tree.NodeSourceStartComparator cmp = new Tree.NodeSourceStartComparator();
+    List<Tree.Node> children = tn.children();
+    Collections.sort(children, cmp);
+    for (Tree.Node child : children) {
+      if (child.isLeaf()) {
+        continue;
+      }
+      int sourceStartIndex = child.sourceStartIndex();
+      int sourceEndIndex = child.sourceEndIndex();
+      if (sourceStartIndex > nextUncoveredIndex) {
+        insertSourceLeaf(n, sourceWords, nextUncoveredIndex, sourceStartIndex);
+      }
+      Node childNode = new Node(child.label(), true);
+      addEdge(new DerivationTreeEdge(true), new Pair<Node>(n, childNode), EdgeType.DIRECTED);
+      nextUncoveredIndex = sourceEndIndex;
+      addSourceSubtreeRootedAt(childNode, child, sourceStartIndex, sourceEndIndex, sourceWords);
+    }
+    if (nextUncoveredIndex < lastIndex) {
+      insertSourceLeaf(n, sourceWords, nextUncoveredIndex, lastIndex);
+    }
+  }
+
+  private void insertSourceLeaf(Node n, String[] words, int start, int end) {
+    final String[] leafWords = Arrays.copyOfRange(words, start, end);
+    String label = leafWords[0];
+    for (int i = 1; i < leafWords.length; i++) {
+      label += " " + leafWords[i];
+    }
+    Node childNode = new Node(label, true);
+    addEdge(new DerivationTreeEdge(true), new Pair<Node>(n, childNode), EdgeType.DIRECTED);
+  }
+
+  public void setSubtreeHighlight(Node n, boolean b) {
+    n.isHighlighted = b;
+    for (Node s : getSuccessors(n)) {
+      setSubtreeHighlight(s, b);
+    }
+    return;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java
new file mode 100644
index 0000000..33b6b22
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+public class DerivationTreeEdge {
+  public final boolean pointsToSource;
+
+  public DerivationTreeEdge(boolean pts) {
+    pointsToSource = pts;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java
new file mode 100644
index 0000000..3e4010f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.awt.Dimension;
+import java.awt.geom.Point2D;
+
+import org.apache.commons.collections15.Transformer;
+
+import edu.uci.ics.jung.algorithms.layout.TreeLayout;
+import edu.uci.ics.jung.graph.DelegateForest;
+
+public class DerivationTreeTransformer implements Transformer<Node, Point2D> {
+  private TreeLayout<Node, DerivationTreeEdge> treeLayout;
+  private DerivationTree graph;
+  private Node root;
+  private Node sourceRoot;
+
+  private boolean isAnchored;
+  private Point2D anchorPoint;
+
+  private double Y_DIST;
+  private double X_DIST;
+
+
+  public DerivationTreeTransformer(DerivationTree t, Dimension d, boolean isAnchored) {
+    this.isAnchored = isAnchored;
+    anchorPoint = new Point2D.Double(0, 0);
+    graph = t;
+    DelegateForest<Node, DerivationTreeEdge> del = new DelegateForest<Node, DerivationTreeEdge>(t);
+    del.setRoot(t.root);
+    del.setRoot(t.sourceRoot);
+    root = t.root;
+    sourceRoot = t.sourceRoot;
+    Y_DIST = d.getHeight() / (2 * (1 + distanceToLeaf(root)));
+    int leafCount = 0;
+    for (Node n : t.getVertices()) {
+      if (t.outDegree(n) == 0) leafCount++;
+    }
+    X_DIST = d.getWidth() / leafCount;
+
+    treeLayout = new TreeLayout<Node, DerivationTreeEdge>(del, (int) Math.round(X_DIST));
+  }
+
+  public Point2D transform(Node n) {
+    double x, y;
+    Point2D t = treeLayout.transform(n);
+    if (n.isSource) {
+      x =
+          /* treeLayout.transform(root).getX() + */(t.getX()
+              - treeLayout.transform(sourceRoot).getX() + treeLayout.transform(root).getX());
+      y = Y_DIST * (distanceToLeaf(n) + 1);
+    } else {
+      x = t.getX();
+      y = Y_DIST * (-1) * distanceToLeaf(n);
+    }
+    if (isAnchored) {
+      x += anchorPoint.getX();
+      y += anchorPoint.getY();
+    }
+    return new Point2D.Double(x, y + Y_DIST * (1 + distanceToLeaf(root)));
+  }
+
+  private int distanceToLeaf(Node n) {
+    if (graph.getSuccessors(n).isEmpty()) return 0;
+    int result = 0;
+    for (Object x : graph.getSuccessors(n)) {
+      int tmp = distanceToLeaf((Node) x);
+      if (tmp > result) result = tmp;
+    }
+    return 1 + result;
+  }
+
+  public Dimension getSize() {
+    int height = (int) Math.round(2 * Y_DIST * (1 + distanceToLeaf(root)));
+    int width = (int) Math.round(2 * treeLayout.transform(root).getX());
+    Dimension ret = new Dimension(width, height);
+    return ret;
+  }
+
+  public Point2D getAnchorPosition(DerivationViewer.AnchorType type) {
+    switch (type) {
+      case ANCHOR_ROOT:
+        return transform(root);
+      case ANCHOR_LEFTMOST_LEAF:
+        Node n = root;
+        while (graph.getSuccessorCount(n) != 0)
+          n = (Node) graph.getSuccessors(n).toArray()[0];
+        return transform(n);
+      default:
+        return new Point2D.Double(0, 0);
+    }
+  }
+
+  public void setAnchorPoint(DerivationViewer.AnchorType type, Point2D viewerAnchor) {
+    Point2D oldAnchor = getAnchorPosition(type);
+    double x = viewerAnchor.getX() - oldAnchor.getX();
+    double y = viewerAnchor.getY() - oldAnchor.getY();
+    anchorPoint = new Point2D.Double(x, y);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java
new file mode 100644
index 0000000..8c6151d
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.awt.BasicStroke;
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Paint;
+import java.awt.Shape;
+import java.awt.Stroke;
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+
+import javax.swing.JLabel;
+
+import org.apache.commons.collections15.Transformer;
+
+import edu.uci.ics.jung.algorithms.layout.CircleLayout;
+import edu.uci.ics.jung.algorithms.layout.StaticLayout;
+import edu.uci.ics.jung.visualization.VisualizationViewer;
+import edu.uci.ics.jung.visualization.control.DefaultModalGraphMouse;
+import edu.uci.ics.jung.visualization.control.LayoutScalingControl;
+import edu.uci.ics.jung.visualization.control.ModalGraphMouse;
+import edu.uci.ics.jung.visualization.decorators.ToStringLabeller;
+import edu.uci.ics.jung.visualization.renderers.Renderer.VertexLabel.Position;
+
+@SuppressWarnings("serial")
+public class DerivationViewer extends VisualizationViewer<Node, DerivationTreeEdge> {
+  public static final int DEFAULT_HEIGHT = 500;
+  public static final int DEFAULT_WIDTH = 500;
+  public static final Color SRC = Color.WHITE;
+  private Color TGT;
+
+  public static final Color HIGHLIGHT = Color.pink;
+
+  public static enum AnchorType {
+    ANCHOR_ROOT, ANCHOR_LEFTMOST_LEAF
+  };
+
+  private AnchorType anchorStyle;
+  private Point2D anchorPoint;
+
+  public DerivationViewer(DerivationTree g, Dimension d, Color targetColor, AnchorType anchor) {
+    super(new CircleLayout<Node, DerivationTreeEdge>(g));
+    anchorStyle = anchor;
+    DerivationTreeTransformer dtt = new DerivationTreeTransformer(g, d, false);
+    StaticLayout<Node, DerivationTreeEdge> derivationLayout =
+        new StaticLayout<Node, DerivationTreeEdge>(g, dtt);
+    // derivationLayout.setSize(dtt.getSize());
+    setGraphLayout(derivationLayout);
+    scaleToLayout(new LayoutScalingControl());
+    // g.addCorrespondences();
+    setPreferredSize(new Dimension(DEFAULT_HEIGHT, DEFAULT_WIDTH));
+    getRenderContext().setVertexLabelTransformer(new ToStringLabeller<Node>());
+
+    DefaultModalGraphMouse<Node, DerivationTreeEdge> graphMouse =
+        new DefaultModalGraphMouse<Node, DerivationTreeEdge>();
+    graphMouse.setMode(ModalGraphMouse.Mode.TRANSFORMING);
+    setGraphMouse(graphMouse);
+    addKeyListener(graphMouse.getModeKeyListener());
+    // this.setPickedVertexState(new DerivationTreePickedState(g));
+
+    getRenderContext().setVertexFillPaintTransformer(vp);
+    getRenderContext().setEdgeStrokeTransformer(es);
+    getRenderContext().setVertexShapeTransformer(ns);
+    getRenderer().getVertexLabelRenderer().setPosition(Position.CNTR);
+
+    TGT = targetColor;
+    anchorPoint = dtt.getAnchorPosition(anchorStyle);
+  }
+
+  public void setGraph(DerivationTree tree) {
+    DerivationTreeTransformer dtt = new DerivationTreeTransformer(tree, getSize(), true);
+    dtt.setAnchorPoint(anchorStyle, anchorPoint);
+    setGraphLayout(new StaticLayout<Node, DerivationTreeEdge>(tree, dtt));
+  }
+
+  private Transformer<Node, Paint> vp = new Transformer<Node, Paint>() {
+    public Paint transform(Node n) {
+      if (n.isHighlighted) return HIGHLIGHT;
+      if (n.isSource)
+        return SRC;
+      else
+        return TGT;
+    }
+  };
+
+  private static Transformer<DerivationTreeEdge, Stroke> es =
+      new Transformer<DerivationTreeEdge, Stroke>() {
+        public Stroke transform(DerivationTreeEdge e) {
+          if (e.pointsToSource) {
+            return new BasicStroke(1.0f,
+								                   BasicStroke.CAP_BUTT,
+																	 BasicStroke.JOIN_MITER,
+																	 10.0f,
+																	 new float[] {10.0f},
+																	 0.0f);
+					} else {
+            return new BasicStroke(1.0f);
+					}
+        }
+      };
+
+  private static Transformer<Node, Shape> ns = new Transformer<Node, Shape>() {
+    public Shape transform(Node n) {
+      JLabel x = new JLabel();
+      double len = x.getFontMetrics(x.getFont()).stringWidth(n.toString());
+      double margin = 5.0;
+      return new Rectangle2D.Double((len + margin) / (-2), 0, len + 2 * margin, 20);
+    }
+  };
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java
new file mode 100644
index 0000000..d6e7a35
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.awt.Color;
+
+import javax.swing.JApplet;
+
+import org.apache.joshua.ui.tree_visualizer.tree.Tree;
+
+/**
+ * An applet for viewing DerivationTrees. It consists of a DerivationViewer inside of the applet's
+ * Panel.
+ * 
+ * @author Jonathan Weese
+ * 
+ */
+@SuppressWarnings("serial")
+public class DerivationViewerApplet extends JApplet {
+  /**
+   * Initializes the applet by getting the source sentence and the tree representation from the
+   * applet tag in a web page.
+   */
+  public void init() {
+    String source = getParameter("sourceSentence");
+    String derivation = getParameter("derivationTree");
+    Tree tree = new Tree(derivation);
+
+    add(new DerivationViewer(new DerivationTree(tree, source),
+        getSize(),
+        Color.red,
+        DerivationViewer.AnchorType.ANCHOR_ROOT));
+    return;
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java
new file mode 100644
index 0000000..2ffeb06
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+/**
+ * A representation of a node in a derivation tree. The derivation tree class itself is
+ * parameterized in terms of this class and the <code>DerivationEdge</code> class. A
+ * <code>Node</code> may represent either a non-terminal symbol or one or more terminal symbols of
+ * the derivation.
+ */
+public class Node {
+  /**
+   * The label to be shown on the node. If the node is a non-terminal symbol, it is the name of the
+   * symbol. Otherwise, it is terminal symbols joined with spaces.
+   */
+  public final String label;
+
+  /**
+   * Indicates whether this node is part of the source-side of target- side derivation tree.
+   */
+  public final boolean isSource;
+
+  /**
+   * A boolean to let the renderer know whether this vertex is highlighted.
+   */
+  public boolean isHighlighted = false;
+
+  /**
+   * Constructor used for root nodes or nodes whose parent is not given.
+   * 
+   * @param label a <code>String</code> that represents the symbols at this node
+   * @param isSource a boolean saying whether this is a source-side node
+   */
+  public Node(String label, boolean isSource) {
+    this.label = label;
+    this.isSource = isSource;
+  }
+
+	@Override
+  public String toString() {
+    return label;
+  }
+}