You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:36 UTC
[25/60] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
new file mode 100644
index 0000000..b9208d2
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
@@ -0,0 +1,959 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import static org.apache.joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import java.util.TreeMap;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.encoding.EncoderConfiguration;
+import org.apache.joshua.util.encoding.FeatureTypeAnalyzer;
+import org.apache.joshua.util.encoding.IntEncoder;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class GrammarPacker {
+
+ private static final Logger LOG = LoggerFactory.getLogger(GrammarPacker.class);
+
+ /**
+ * The packed grammar version number. Increment this any time you add new features, and update
+ * the documentation.
+ *
+ * Version history:
+ *
+ * - 3 (May 2016). This was the first version that was marked. It removed the special phrase-
+ * table packing that packed phrases without the [X,1] on the source and target sides, which
+ * then required special handling in the decoder to use for phrase-based decoding.
+ *
+ *
+ */
+ public static final int VERSION = 3;
+
+ // Size limit for slice in bytes.
+ private static int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
+ // Estimated average number of feature entries for one rule.
+ private static int DATA_SIZE_ESTIMATE = 20;
+
+ private static final String SOURCE_WORDS_SEPARATOR = " ||| ";
+
+ // Output directory name.
+ private String output;
+
+ // Input grammar to be packed.
+ private String grammar;
+
+ /**
+  * Returns the grammar file name this packer was constructed with.
+  *
+  * @return the input grammar path
+  */
+ public String getGrammar() {
+   return this.grammar;
+ }
+
+ /**
+  * Returns the directory name the packed grammar is written to.
+  *
+  * @return the output directory path
+  */
+ public String getOutputDirectory() {
+   return this.output;
+ }
+
+ // Approximate maximum size of a slice in number of rules
+ private int approximateMaximumSliceSize;
+
+ private boolean labeled;
+
+ private boolean packAlignments;
+ private boolean grammarAlignments;
+ private String alignments;
+
+ private FeatureTypeAnalyzer types;
+ private EncoderConfiguration encoderConfig;
+
+ private String dump;
+
+ private int max_source_len;
+
+ /**
+  * Creates a packer for one grammar and prepares its output directory.
+  *
+  * @param grammar_filename the grammar to pack
+  * @param config_filename optional packer config; when non-null it is read both by this class
+  *        (for {@code slice_size}) and by the feature type analyzer
+  * @param output_filename directory the packed grammar is written to; created if missing
+  * @param alignments_filename optional separate alignments file; must exist when non-null
+  * @param featuredump_filename optional file that {@link #pack()} dumps the analyzer state to
+  * @param grammar_alignments whether alignments are taken from the grammar rules themselves
+  * @param approximateMaximumSliceSize approximate maximum number of rules per slice; a
+  *        {@code slice_size} entry in the config overrides this value
+  * @throws IOException if the config cannot be read
+  * @throws RuntimeException if the alignments file is missing or the output directory cannot be
+  *         created
+  */
+ public GrammarPacker(String grammar_filename, String config_filename, String output_filename,
+     String alignments_filename, String featuredump_filename, boolean grammar_alignments,
+     int approximateMaximumSliceSize)
+     throws IOException {
+   this.labeled = true;
+   this.grammar = grammar_filename;
+   this.output = output_filename;
+   this.dump = featuredump_filename;
+   this.grammarAlignments = grammar_alignments;
+   this.approximateMaximumSliceSize = approximateMaximumSliceSize;
+   this.max_source_len = 0;
+
+   // TODO: Always open encoder config? This is debatable.
+   this.types = new FeatureTypeAnalyzer(true);
+
+   this.alignments = alignments_filename;
+   packAlignments = grammarAlignments || (alignments != null);
+   if (!packAlignments) {
+     LOG.info("No alignments file or grammar specified, skipping.");
+   } else if (alignments != null && !new File(alignments_filename).exists()) {
+     throw new RuntimeException("Alignments file does not exist: " + alignments);
+   }
+
+   if (config_filename != null) {
+     readConfig(config_filename);
+     types.readConfig(config_filename);
+   } else {
+     LOG.info("No config specified. Attempting auto-detection of feature types.");
+   }
+   // Log the field, not the parameter: readConfig() may have replaced the value passed in.
+   LOG.info("Approximate maximum slice size (in # of rules) set to {}",
+       this.approximateMaximumSliceSize);
+
+   // mkdirs() also creates missing parents; isDirectory() rejects a regular file of that name.
+   File working_dir = new File(output);
+   working_dir.mkdirs();
+   if (!working_dir.isDirectory()) {
+     throw new RuntimeException("Failed creating output directory.");
+   }
+ }
+
+ /**
+  * Reads the packer config and picks up the {@code slice_size} setting if present.
+  * Everything from a {@code #} to the end of a line is a comment; blank lines are skipped.
+  *
+  * @param config_filename path of the config file
+  * @throws IOException if the file cannot be read or closed
+  * @throws RuntimeException if a non-empty line has fewer than two fields
+  */
+ private void readConfig(String config_filename) throws IOException {
+   LineReader reader = new LineReader(config_filename);
+   try {
+     while (reader.hasNext()) {
+       // Clean up line, chop comments off and skip if the result is empty.
+       String line = reader.next().trim();
+       int comment_start = line.indexOf('#');
+       if (comment_start != -1)
+         line = line.substring(0, comment_start);
+       if (line.isEmpty())
+         continue;
+       String[] fields = line.split("[\\s]+");
+
+       if (fields.length < 2) {
+         throw new RuntimeException("Incomplete line in config.");
+       }
+       if ("slice_size".equals(fields[0])) {
+         // Number of records to concurrently load into memory for sorting.
+         approximateMaximumSliceSize = Integer.parseInt(fields[1]);
+       }
+     }
+   } finally {
+     // Release the reader even when a malformed line aborts parsing.
+     reader.close();
+   }
+ }
+
+ /**
+  * Executes the packing.
+  *
+  * Runs an exploration pass over the grammar, writes the encoding, vocabulary and config
+  * files into the output directory, then runs the packing pass that writes the slices.
+  *
+  * @throws IOException if there is an error reading the grammar or writing the output
+  */
+ public void pack() throws IOException {
+   LOG.info("Beginning exploration pass.");
+
+   // Explore pass. Learn vocabulary and feature value histograms.
+   LOG.info("Exploring: {}", grammar);
+
+   HieroFormatReader grammarReader = getGrammarReader();
+   explore(grammarReader);
+
+   LOG.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
+   if (dump != null) {
+     try (PrintWriter dump_writer = new PrintWriter(dump)) {
+       dump_writer.println(types.toString());
+     }
+   }
+
+   types.inferTypes(this.labeled);
+   LOG.info("Type inference complete.");
+
+   LOG.info("Finalizing encoding.");
+
+   LOG.info("Writing encoding.");
+   types.write(output + File.separator + "encoding");
+
+   writeVocabulary();
+
+   String configFile = output + File.separator + "config";
+   LOG.info("Writing config to '{}'", configFile);
+   // Write config options
+   try (FileWriter config = new FileWriter(configFile)) {
+     config.write(String.format("version = %d\n", VERSION));
+     config.write(String.format("max-source-len = %d\n", max_source_len));
+   }
+
+   // Read previously written encoder configuration to match up to changed
+   // vocabulary id's.
+   LOG.info("Reading encoding.");
+   encoderConfig = new EncoderConfiguration();
+   encoderConfig.load(output + File.separator + "encoding");
+
+   LOG.info("Beginning packing pass.");
+   // Actual binarization pass. Slice and pack source, target and data.
+   grammarReader = getGrammarReader();
+   LineReader alignment_reader = null;
+   if (packAlignments && !grammarAlignments)
+     alignment_reader = new LineReader(alignments);
+   try {
+     binarize(grammarReader, alignment_reader);
+   } finally {
+     // The alignments file is only opened for this pass; release it whatever happens.
+     if (alignment_reader != null)
+       alignment_reader.close();
+   }
+   LOG.info("Packing complete.");
+
+   LOG.info("Packed grammar in: {}", output);
+   LOG.info("Done.");
+ }
+
+ /**
+  * Returns a reader that turns whatever file format is found into Hiero grammar rules.
+  * The format is chosen from the first line of the grammar file: a line starting with
+  * {@code [} selects the Hiero reader, anything else the Moses reader.
+  *
+  * @return a fresh reader over the grammar file
+  * @throws IOException if the grammar file cannot be opened or closed
+  */
+ private HieroFormatReader getGrammarReader() throws IOException {
+   // This reader is only used to peek at the first line; close it before handing out the
+   // real grammar reader so the file handle is not leaked.
+   LineReader reader = new LineReader(grammar);
+   String line;
+   try {
+     line = reader.next();
+   } finally {
+     reader.close();
+   }
+   if (line.startsWith("[")) {
+     return new HieroFormatReader(grammar);
+   } else {
+     return new MosesFormatReader(grammar);
+   }
+ }
+
+ /**
+  * First pass over the grammar: records the longest source side seen and feeds every
+  * feature value to the feature type analyzer.
+  *
+  * Labeled features ({@code name=value}) are observed under their name, except
+  * {@code Alignment}, which is skipped. Unlabeled values are observed under their
+  * position among the rule's unlabeled values ("0", "1", ...).
+  *
+  * @param reader the grammar rules to scan
+  */
+ private void explore(HieroFormatReader reader) {
+
+   // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
+   // appear in the same order. They are assigned numeric names in order of appearance.
+   this.types.setLabeled(true);
+
+   for (Rule rule : reader) {
+
+     max_source_len = Math.max(max_source_len, rule.getFrench().length);
+
+     /* Add symbols to vocabulary.
+      * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
+      * and "[X,1]" to the vocabulary.
+      *
+      * TODO: MJP May 2016: Is it necessary to add [X,1]? This is currently being done in
+      * {@link HieroFormatReader}, which is called by {@link MosesFormatReader}.
+      */
+
+     // Add feature names to vocabulary and pass the value through the
+     // appropriate encoder.
+     int feature_counter = 0;
+     for (String feature : rule.getFeatureString().split("\\s+")) {
+       // An empty or whitespace-led feature string yields a blank token; skipping it
+       // avoids Float.parseFloat("") failing on a rule that simply has no features.
+       if (feature.isEmpty())
+         continue;
+       if (feature.contains("=")) {
+         String[] fe = feature.split("=");
+         if (fe[0].equals("Alignment"))
+           continue;
+         types.observe(Vocabulary.id(fe[0]), Float.parseFloat(fe[1]));
+       } else {
+         types.observe(Vocabulary.id(String.valueOf(feature_counter++)),
+             Float.parseFloat(feature));
+       }
+     }
+   }
+ }
+
+ /**
+ * Returns a String encoding the first two source words.
+ * If there is only one source word, use empty string for the second.
+ */
+ private String getFirstTwoSourceWords(final String[] source_words) {
+ return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
+ }
+
+ /**
+  * Second pass over the grammar: adds every rule to the source and target tries and to the
+  * feature (and optionally alignment) buffer, writing a slice to disk whenever the slice
+  * limit has been reached and the first two source words change.
+  *
+  * @param grammarReader the grammar rules to pack
+  * @param alignment_reader reader over the separate alignments file; only consulted when
+  *        alignments are packed and are not taken from the grammar itself
+  * @throws IOException if writing a slice fails
+  * @throws RuntimeException if the alignments file runs out of lines or the feature and
+  *         alignment block indices diverge
+  */
+ private void binarize(HieroFormatReader grammarReader, LineReader alignment_reader) throws IOException {
+   int counter = 0;
+   int slice_counter = 0;
+   int num_slices = 0;
+
+   boolean ready_to_flush = false;
+   // to determine when flushing is possible
+   String prev_first_two_source_words = null;
+
+   PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
+   PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
+   FeatureBuffer feature_buffer = new FeatureBuffer();
+
+   AlignmentBuffer alignment_buffer = null;
+   if (packAlignments)
+     alignment_buffer = new AlignmentBuffer();
+
+   TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
+   for (Rule rule : grammarReader) {
+     counter++;
+     slice_counter++;
+
+     String lhs_word = Vocabulary.word(rule.getLHS());
+     String[] source_words = rule.getFrenchWords().split("\\s+");
+     String[] target_words = rule.getEnglishWords().split("\\s+");
+     String[] feature_entries = rule.getFeatureString().split("\\s+");
+
+     // Reached slice limit size, indicate that we're closing up.
+     if (!ready_to_flush
+         && (slice_counter > approximateMaximumSliceSize
+             || feature_buffer.overflowing()
+             || (packAlignments && alignment_buffer.overflowing()))) {
+       ready_to_flush = true;
+       // store the first two source words when slice size limit was reached
+       prev_first_two_source_words = getFirstTwoSourceWords(source_words);
+     }
+     // ready to flush
+     if (ready_to_flush) {
+       final String first_two_source_words = getFirstTwoSourceWords(source_words);
+       // the grammar can only be partitioned at the level of first two source word changes.
+       // Thus, we can only flush if the current first two source words differ from the ones
+       // when the slice size limit was reached.
+       if (!first_two_source_words.equals(prev_first_two_source_words)) {
+         LOG.warn("ready to flush and first two words have changed ({} vs. {})",
+             prev_first_two_source_words, first_two_source_words);
+         LOG.info("flushing {} rules to slice.", slice_counter);
+         flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+         source_trie.clear();
+         target_trie.clear();
+         feature_buffer.clear();
+         if (packAlignments)
+           alignment_buffer.clear();
+
+         num_slices++;
+         slice_counter = 0;
+         ready_to_flush = false;
+       }
+     }
+
+     int alignment_index = -1;
+     // If present, process alignments.
+     if (packAlignments) {
+       String alignment_line;
+       if (grammarAlignments) {
+         alignment_line = rule.getAlignmentString();
+       } else {
+         if (!alignment_reader.hasNext()) {
+           LOG.error("No more alignments starting in line {}", counter);
+           throw new RuntimeException("No more alignments starting in line " + counter);
+         }
+         alignment_line = alignment_reader.next().trim();
+       }
+       alignment_index = alignment_buffer.add(parseAlignmentPoints(alignment_line));
+     }
+
+     // Process features.
+     // Implicitly sort via TreeMap, write to data buffer, remember position
+     // to pass on to the source trie node.
+     collectFeatures(feature_entries, features);
+     int features_index = feature_buffer.add(features);
+
+     // Sanity check on the data block index.
+     if (packAlignments && features_index != alignment_index) {
+       LOG.error("Block index mismatch between features ({}) and alignments ({}).",
+           features_index, alignment_index);
+       throw new RuntimeException("Data block index mismatch.");
+     }
+
+     // Process source side.
+     SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
+     source_trie.add(encodeSource(source_words), sv);
+
+     // Process target side.
+     TargetValue tv = new TargetValue(sv);
+     target_trie.add(encodeTarget(target_words), tv);
+   }
+   // flush last slice and clear buffers
+   flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+ }
+
+ /**
+  * Parses a line of {@code src-tgt} alignment points into a flat byte array
+  * (source index, target index, source index, ...).
+  *
+  * A null or blank line yields an empty array. Sizing the array from the split of an empty
+  * line would otherwise produce one all-zero pair, i.e. a phantom 0-0 alignment point.
+  * NOTE(review): null is tolerated on the assumption that a rule may carry no alignment
+  * string; confirm against Rule.getAlignmentString().
+  */
+ private byte[] parseAlignmentPoints(String alignment_line) {
+   String trimmed = (alignment_line == null) ? "" : alignment_line.trim();
+   if (trimmed.isEmpty())
+     return new byte[0];
+   // Split on whitespace runs so repeated blanks do not produce empty entries.
+   String[] alignment_entries = trimmed.split("\\s+");
+   byte[] points = new byte[alignment_entries.length * 2];
+   for (int i = 0; i < alignment_entries.length; i++) {
+     String[] parts = alignment_entries[i].split("-");
+     points[2 * i] = Byte.parseByte(parts[0]);
+     points[2 * i + 1] = Byte.parseByte(parts[1]);
+   }
+   return points;
+ }
+
+ /**
+  * Fills {@code features} (after clearing it) with the non-zero features of one rule, keyed
+  * by the encoder's inner feature id. {@code Alignment=...} entries and blank tokens are
+  * skipped; unlabeled values are named by their position among the unlabeled values.
+  */
+ private void collectFeatures(String[] feature_entries, TreeMap<Integer, Float> features) {
+   features.clear();
+   int feature_count = 0;
+   for (String feature_entry : feature_entries) {
+     // A rule without features splits into a single blank token; there is nothing to parse.
+     if (feature_entry.isEmpty())
+       continue;
+     int feature_id;
+     float feature_value;
+     if (feature_entry.contains("=")) {
+       String[] parts = feature_entry.split("=");
+       if (parts[0].equals("Alignment"))
+         continue;
+       feature_id = Vocabulary.id(parts[0]);
+       feature_value = Float.parseFloat(parts[1]);
+     } else {
+       feature_id = Vocabulary.id(String.valueOf(feature_count++));
+       feature_value = Float.parseFloat(feature_entry);
+     }
+     if (feature_value != 0)
+       features.put(encoderConfig.innerId(feature_id), feature_value);
+   }
+ }
+
+ /**
+  * Maps source-side tokens to vocabulary ids; nonterminals are looked up with their index
+  * stripped.
+  */
+ private int[] encodeSource(String[] source_words) {
+   int[] source = new int[source_words.length];
+   for (int i = 0; i < source_words.length; i++) {
+     if (FormatUtils.isNonterminal(source_words[i]))
+       source[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_words[i]));
+     else
+       source[i] = Vocabulary.id(source_words[i]);
+   }
+   return source;
+ }
+
+ /**
+  * Maps target-side tokens to ids in reverse order (last token first); a nonterminal is
+  * stored as the negated value of its index.
+  */
+ private int[] encodeTarget(String[] target_words) {
+   int[] target = new int[target_words.length];
+   for (int i = 0; i < target_words.length; i++) {
+     if (FormatUtils.isNonterminal(target_words[i])) {
+       target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
+     } else {
+       target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
+     }
+   }
+   return target;
+ }
+
+ /**
+  * Serializes the source, target and feature data structures into interlinked binary files. Target
+  * is written first, into a skeletal (nodes don't carry any data) upward-pointing trie, updating
+  * the linking source trie nodes with the position once it is known. Source and feature data are
+  * written simultaneously. The source structure is written into a downward-pointing trie and
+  * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
+  * prompted to write out its blocks in the order in which the source trie references them; the
+  * alignment stream, when alignments are packed, is written in the same order.
+  *
+  * All output streams are closed when this method returns, whether normally or by exception.
+  *
+  * @param source_trie trie of source sides for this slice
+  * @param target_trie trie of (reversed) target sides for this slice
+  * @param feature_buffer buffered feature blocks for this slice
+  * @param alignment_buffer buffered alignment blocks; only used when alignments are packed
+  * @param id slice number, used to name the slice files
+  * @throws IOException if any slice file cannot be created or written
+  * @throws RuntimeException if feature and alignment block indices diverge
+  */
+ private void flush(PackingTrie<SourceValue> source_trie,
+     PackingTrie<TargetValue> target_trie, FeatureBuffer feature_buffer,
+     AlignmentBuffer alignment_buffer, int id) throws IOException {
+   // Make a slice object for this piece of the grammar.
+   PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
+   // Pull out the streams for source, target and data output. try-with-resources closes every
+   // stream that was opened, even if a later one fails to open or a write throws. The
+   // alignment stream is null when alignments are not packed; null resources are skipped.
+   try (DataOutputStream source_stream = slice.getSourceOutput();
+       DataOutputStream target_stream = slice.getTargetOutput();
+       DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
+       DataOutputStream feature_stream = slice.getFeatureOutput();
+       DataOutputStream alignment_stream = slice.getAlignmentOutput()) {
+
+     // Add trie root into queue and start writing the target trie at position 0. Positions
+     // advance by PackingTrie.size(), i.e. they count ints, not bytes.
+     Queue<PackingTrie<TargetValue>> target_queue = new LinkedList<PackingTrie<TargetValue>>();
+     target_queue.add(target_trie);
+     int target_position = 0;
+
+     // Target lookup table for trie levels.
+     int current_level_size = 1;
+     int next_level_size = 0;
+     List<Integer> target_lookup = new ArrayList<Integer>();
+
+     // Packing loop for upwards-pointing target trie.
+     while (!target_queue.isEmpty()) {
+       // Pop top of queue.
+       PackingTrie<TargetValue> node = target_queue.poll();
+       // Register that this is where we're writing the node to.
+       node.address = target_position;
+       // Tell source nodes that we're writing to this position in the file.
+       for (TargetValue tv : node.values)
+         tv.parent.target = node.address;
+       // Write link to parent.
+       if (node.parent != null)
+         target_stream.writeInt(node.parent.address);
+       else
+         target_stream.writeInt(-1);
+       target_stream.writeInt(node.symbol);
+       // Enqueue children.
+       for (int k : node.children.descendingKeySet()) {
+         PackingTrie<TargetValue> child = node.children.get(k);
+         target_queue.add(child);
+       }
+       target_position += node.size(false, true);
+       next_level_size += node.children.descendingKeySet().size();
+
+       // A level is finished once all of its nodes have been written; record where it ends.
+       current_level_size--;
+       if (current_level_size == 0) {
+         target_lookup.add(target_position);
+         current_level_size = next_level_size;
+         next_level_size = 0;
+       }
+     }
+     target_lookup_stream.writeInt(target_lookup.size());
+     for (int i : target_lookup)
+       target_lookup_stream.writeInt(i);
+
+     // Setting up for source and data writing. source_position counts the ints both written
+     // into the source stream and still buffered in the source queue.
+     Queue<PackingTrie<SourceValue>> source_queue = new LinkedList<PackingTrie<SourceValue>>();
+     source_queue.add(source_trie);
+     int source_position = source_trie.size(true, false);
+     source_trie.address = target_position;
+
+     // Ready data buffers for writing.
+     feature_buffer.initialize();
+     if (packAlignments)
+       alignment_buffer.initialize();
+
+     // Packing loop for downwards-pointing source trie.
+     while (!source_queue.isEmpty()) {
+       // Pop top of queue.
+       PackingTrie<SourceValue> node = source_queue.poll();
+       // Write number of children.
+       source_stream.writeInt(node.children.size());
+       // Write links to children.
+       for (int k : node.children.descendingKeySet()) {
+         PackingTrie<SourceValue> child = node.children.get(k);
+         // Enqueue child.
+         source_queue.add(child);
+         // Child's address will be at the current end of the queue.
+         child.address = source_position;
+         // Advance cumulated size by child's size.
+         source_position += child.size(true, false);
+         // Write the link.
+         source_stream.writeInt(k);
+         source_stream.writeInt(child.address);
+       }
+       // Write number of data items.
+       source_stream.writeInt(node.values.size());
+       // Write lhs and links to target and data.
+       for (SourceValue sv : node.values) {
+         int feature_block_index = feature_buffer.write(sv.data);
+         if (packAlignments) {
+           int alignment_block_index = alignment_buffer.write(sv.data);
+           if (alignment_block_index != feature_block_index) {
+             LOG.error("Block index mismatch.");
+             throw new RuntimeException("Block index mismatch: alignment (" + alignment_block_index
+                 + ") and features (" + feature_block_index + ") don't match.");
+           }
+         }
+         source_stream.writeInt(sv.lhs);
+         source_stream.writeInt(sv.target);
+         source_stream.writeInt(feature_block_index);
+       }
+     }
+     // Flush the data stream.
+     feature_buffer.flush(feature_stream);
+     if (packAlignments)
+       alignment_buffer.flush(alignment_stream);
+   }
+ }
+
+ /**
+  * Writes the vocabulary into the output directory under the packed grammar's vocabulary
+  * file name.
+  *
+  * @throws IOException if the vocabulary cannot be written
+  */
+ public void writeVocabulary() throws IOException {
+   final String vocabularyPath = this.output + File.separator + VOCABULARY_FILENAME;
+   LOG.info("Writing vocabulary to {}", vocabularyPath);
+   Vocabulary.write(vocabularyPath);
+ }
+
+ /**
+  * Integer-labeled, doubly-linked trie with some provisions for packing: every node knows
+  * its parent, its incoming symbol, and the address it was assigned when written out.
+  *
+  * @author Juri Ganitkevitch
+  *
+  * @param <D> The trie's value type.
+  */
+ class PackingTrie<D extends PackingTrieValue> {
+   int symbol;
+   PackingTrie<D> parent;
+
+   TreeMap<Integer, PackingTrie<D>> children;
+   List<D> values;
+
+   int address;
+
+   /** Creates a root node: no parent, symbol 0, address not yet assigned (-1). */
+   PackingTrie() {
+     this.symbol = 0;
+     this.parent = null;
+     this.address = -1;
+     this.children = new TreeMap<Integer, PackingTrie<D>>();
+     this.values = new ArrayList<D>();
+   }
+
+   /** Creates an inner node reached from {@code parent} via {@code symbol}. */
+   PackingTrie(PackingTrie<D> parent, int symbol) {
+     this();
+     this.parent = parent;
+     this.symbol = symbol;
+   }
+
+   /**
+    * Stores {@code value} at the node reached by following {@code path} from this node,
+    * creating missing nodes on the way.
+    */
+   void add(int[] path, D value) {
+     PackingTrie<D> node = this;
+     for (int label : path) {
+       PackingTrie<D> next = node.children.get(label);
+       if (next == null) {
+         next = new PackingTrie<D>(node, label);
+         node.children.put(label, next);
+       }
+       node = next;
+     }
+     node.values.add(value);
+   }
+
+   /**
+    * Calculate the size (in ints) of a packed trie node. Distinguishes downwards pointing (parent
+    * points to children) from upwards pointing (children point to parent) tries, as well as
+    * skeletal (no data, just the labeled links) and non-skeletal (nodes have a data block)
+    * packing.
+    *
+    * @param downwards Are we packing into a downwards-pointing trie?
+    * @param skeletal Are we packing into a skeletal trie?
+    *
+    * @return Number of ints the trie node would occupy.
+    */
+   int size(boolean downwards, boolean skeletal) {
+     // Downwards: child count plus (symbol, address) per child. Upwards: parent link and symbol.
+     int total = downwards ? 1 + 2 * children.size() : 2;
+     if (skeletal) {
+       return total;
+     }
+     // Non-skeletal: the number of data items, then the items themselves. All values of a
+     // node are taken to have the size of the first one.
+     total += 1;
+     if (!values.isEmpty()) {
+       total += values.size() * values.get(0).size();
+     }
+     return total;
+   }
+
+   /** Drops all children and values of this node. */
+   void clear() {
+     children.clear();
+     values.clear();
+   }
+ }
+
+ /** A value that can be stored in a PackingTrie node. */
+ interface PackingTrieValue {
+ /** Size of one value, as counted by PackingTrie.size() for non-skeletal nodes. */
+ int size();
+ }
+
+ /**
+  * Payload of a source trie node: the rule's left-hand side, the index of its data block,
+  * and the address of the matching target trie node (filled in while the target trie is
+  * written).
+  */
+ class SourceValue implements PackingTrieValue {
+   int lhs;
+   int data;
+   int target;
+
+   public SourceValue() {
+   }
+
+   SourceValue(int lhs, int data) {
+     this.data = data;
+     this.lhs = lhs;
+   }
+
+   void setTarget(int target) {
+     this.target = target;
+   }
+
+   /** Three ints are written per source value: lhs, target address and data block index. */
+   @Override
+   public int size() {
+     return 3;
+   }
+ }
+
+ /**
+  * Payload of a target trie node: a back-reference to the source value of the same rule,
+  * so the source side can be told where its target was written.
+  */
+ class TargetValue implements PackingTrieValue {
+   SourceValue parent;
+
+   TargetValue(SourceValue parent) {
+     this.parent = parent;
+   }
+
+   /** Target values occupy no space of their own. */
+   @Override
+   public int size() {
+     return 0;
+   }
+ }
+
+ /**
+  * Growable in-memory buffer of variable-length data blocks that can be written to disk in
+  * an order different from the order in which the blocks were added.
+  *
+  * Blocks are appended with {@link #add}. A disk order is then built by calling
+  * {@link #initialize()} followed by {@link #write(int)} per block, and emitted with
+  * {@link #flush(DataOutputStream)}.
+  *
+  * @param <T> the type of item encoded into one block
+  */
+ abstract class PackingBuffer<T> {
+   // Largest array length requested. NOTE(review): chosen slightly below Integer.MAX_VALUE
+   // because VMs commonly cannot allocate arrays of exactly that length - confirm for the
+   // target JVM.
+   private static final int MAX_BUFFER_SIZE = Integer.MAX_VALUE - 8;
+   // Smallest backing array; keeps doubling meaningful for tiny or zero slice sizes.
+   private static final int MIN_BUFFER_SIZE = 64;
+
+   private byte[] backing;
+   protected ByteBuffer buffer;
+
+   // Start offset of each block inside the backing array, by block index.
+   protected List<Integer> memoryLookup;
+   // Number of bytes of block data currently held.
+   protected int totalSize;
+   // Block indices in the order they are to be written to disk.
+   protected List<Integer> onDiskOrder;
+
+   PackingBuffer() throws IOException {
+     allocate();
+     memoryLookup = new ArrayList<Integer>();
+     onDiskOrder = new ArrayList<Integer>();
+     totalSize = 0;
+   }
+
+   /**
+    * Encodes one item as a new block at the end of the buffer.
+    *
+    * @param item the item to encode
+    * @return the index of the new block
+    */
+   abstract int add(T item);
+
+   // Allocate a reasonably-sized buffer for the feature data. The estimate is computed in
+   // long arithmetic and clamped, so a large slice size cannot overflow into a negative
+   // array length and a slice size of zero cannot produce an array that never grows.
+   private void allocate() {
+     long estimate = (long) approximateMaximumSliceSize * DATA_SIZE_ESTIMATE;
+     int initial_length = (int) Math.min(DATA_SIZE_LIMIT, Math.max(MIN_BUFFER_SIZE, estimate));
+     backing = new byte[initial_length];
+     buffer = ByteBuffer.wrap(backing);
+   }
+
+   // Reallocate the backing array and buffer, copies data over. Doubles the capacity, up to
+   // MAX_BUFFER_SIZE; does nothing once that cap has been reached.
+   protected void reallocate() {
+     if (backing.length >= MAX_BUFFER_SIZE)
+       return;
+     long attempted_length = Math.max(backing.length * 2L, MIN_BUFFER_SIZE);
+     int new_length = (int) Math.min(attempted_length, MAX_BUFFER_SIZE);
+     byte[] new_backing = new byte[new_length];
+     System.arraycopy(backing, 0, new_backing, 0, backing.length);
+     int old_position = buffer.position();
+     ByteBuffer new_buffer = ByteBuffer.wrap(new_backing);
+     new_buffer.position(old_position);
+     buffer = new_buffer;
+     backing = new_backing;
+   }
+
+   /**
+    * Prepare the data buffer for disk writing.
+    */
+   void initialize() {
+     onDiskOrder.clear();
+   }
+
+   /**
+    * Enqueue a data block for later writing.
+    *
+    * @param block_index The index of the data block to add to writing queue.
+    * @return The to-be-written block's output index.
+    */
+   int write(int block_index) {
+     onDiskOrder.add(block_index);
+     return onDiskOrder.size() - 1;
+   }
+
+   /**
+    * Performs the actual writing to disk in the order specified by calls to write() since the last
+    * call to initialize().
+    *
+    * @param out the stream to write header and blocks to; nothing may have been written to it yet
+    * @throws IOException if writing fails
+    */
+   void flush(DataOutputStream out) throws IOException {
+     writeHeader(out);
+     for (int block_index : onDiskOrder) {
+       int block_address = memoryLookup.get(block_index);
+       out.write(backing, block_address, blockSize(block_index));
+     }
+   }
+
+   /** Empties the buffer so it can be reused for the next slice. */
+   void clear() {
+     buffer.clear();
+     memoryLookup.clear();
+     onDiskOrder.clear();
+     // Keep the size consistent with the (now empty) block table.
+     totalSize = 0;
+   }
+
+   /** Returns whether the buffered data has reached the slice size limit. */
+   boolean overflowing() {
+     return (buffer.position() >= DATA_SIZE_LIMIT);
+   }
+
+   // Header layout: number of blocks, total data size, then the disk offset of every block.
+   private void writeHeader(DataOutputStream out) throws IOException {
+     if (out.size() == 0) {
+       out.writeInt(onDiskOrder.size());
+       out.writeInt(totalSize);
+       int disk_position = headerSize();
+       for (int block_index : onDiskOrder) {
+         out.writeInt(disk_position);
+         disk_position += blockSize(block_index);
+       }
+     } else {
+       throw new RuntimeException("Got a used stream for header writing.");
+     }
+   }
+
+   private int headerSize() {
+     // One integer for each data block, plus number of blocks and total size.
+     return 4 * (onDiskOrder.size() + 2);
+   }
+
+   // A block ends where the next one starts; the last block ends at totalSize.
+   private int blockSize(int block_index) {
+     int block_address = memoryLookup.get(block_index);
+     return (block_index < memoryLookup.size() - 1 ? memoryLookup.get(block_index + 1) : totalSize)
+         - block_address;
+   }
+ }
+
+ /**
+  * Packing buffer whose blocks hold the encoded features of one rule each: the number of
+  * features, followed by an (id, value) pair per feature.
+  */
+ class FeatureBuffer extends PackingBuffer<TreeMap<Integer, Float>> {
+
+   private IntEncoder idEncoder;
+
+   FeatureBuffer() throws IOException {
+     super();
+     idEncoder = types.getIdEncoder();
+     LOG.info("Encoding feature ids in: {}", idEncoder.getKey());
+   }
+
+   /**
+    * Add a block of features to the buffer. Zero-valued features are not stored, and the
+    * count written at the start of the block covers only the features that are stored.
+    *
+    * @param features TreeMap with the features for one rule.
+    * @return The index of the resulting data block.
+    */
+   int add(TreeMap<Integer, Float> features) {
+     int data_position = buffer.position();
+
+     // Over-estimate how much room this addition will need: for each
+     // feature (ID_SIZE for label, "upper bound" of 4 for the value), plus ID_SIZE for
+     // the number of features. If this won't fit, reallocate the buffer, repeatedly if one
+     // doubling is not enough. Stop when the buffer refuses to grow any further.
+     int size_estimate = (4 + EncoderConfiguration.ID_SIZE) * features.size()
+         + EncoderConfiguration.ID_SIZE;
+     while (buffer.capacity() - buffer.position() <= size_estimate) {
+       int capacity_before = buffer.capacity();
+       reallocate();
+       if (buffer.capacity() <= capacity_before)
+         break;
+     }
+
+     // Count what will actually be written, so the block header never promises more
+     // entries than follow it.
+     int stored_features = 0;
+     for (float v : features.values()) {
+       if (v != 0.0)
+         stored_features++;
+     }
+
+     // Write features to buffer.
+     idEncoder.write(buffer, stored_features);
+     for (Integer k : features.descendingKeySet()) {
+       float v = features.get(k);
+       // Sparse features.
+       if (v != 0.0) {
+         idEncoder.write(buffer, k);
+         encoderConfig.encoder(k).write(buffer, v);
+       }
+     }
+     // Store position the block was written to.
+     memoryLookup.add(data_position);
+     // Update total size (in bytes).
+     totalSize = buffer.position();
+
+     // Return block index.
+     return memoryLookup.size() - 1;
+   }
+ }
+
+ /**
+  * Packing buffer whose blocks hold the alignment points of one rule each: one byte with
+  * the number of points, followed by the point bytes.
+  */
+ class AlignmentBuffer extends PackingBuffer<byte[]> {
+
+   AlignmentBuffer() throws IOException {
+     super();
+   }
+
+   /**
+    * Add a rule's alignments to the buffer.
+    *
+    * @param alignments a byte array with the alignment points for one rule.
+    * @return The index of the resulting data block.
+    */
+   int add(byte[] alignments) {
+     int data_position = buffer.position();
+     int size_estimate = alignments.length + 1;
+     // Grow until the block fits; a single doubling may not be enough for a small buffer.
+     // Stop when the buffer refuses to grow any further.
+     while (buffer.capacity() - buffer.position() <= size_estimate) {
+       int capacity_before = buffer.capacity();
+       reallocate();
+       if (buffer.capacity() <= capacity_before)
+         break;
+     }
+
+     // Write alignment points to buffer.
+     // NOTE(review): the point count is narrowed to a single byte - confirm the reader's
+     // expectations for rules with more than 127 points.
+     buffer.put((byte) (alignments.length / 2));
+     buffer.put(alignments);
+
+     // Store position the block was written to.
+     memoryLookup.add(data_position);
+     // Update total size (in bytes).
+     totalSize = buffer.position();
+     // Return block index.
+     return memoryLookup.size() - 1;
+   }
+ }
+
+ /**
+  * The files that make up one slice of the packed grammar (source, target, target lookup,
+  * features and, when alignments are packed, alignments), all sharing a common name prefix
+  * inside the output directory. Tuples order by size, largest first.
+  */
+ class PackingFileTuple implements Comparable<PackingFileTuple> {
+   private final File sourceFile;
+   private final File targetLookupFile;
+   private final File targetFile;
+
+   private final File featureFile;
+   private final File alignmentFile;
+
+   /**
+    * @param prefix file name prefix of the slice, e.g. {@code slice_00000}
+    */
+   PackingFileTuple(String prefix) {
+     sourceFile = new File(output + File.separator + prefix + ".source");
+     targetFile = new File(output + File.separator + prefix + ".target");
+     targetLookupFile = new File(output + File.separator + prefix + ".target.lookup");
+     featureFile = new File(output + File.separator + prefix + ".features");
+
+     alignmentFile = packAlignments
+         ? new File(output + File.separator + prefix + ".alignments")
+         : null;
+
+     LOG.info("Allocated slice: {}", sourceFile.getAbsolutePath());
+   }
+
+   DataOutputStream getSourceOutput() throws IOException {
+     return getOutput(sourceFile);
+   }
+
+   DataOutputStream getTargetOutput() throws IOException {
+     return getOutput(targetFile);
+   }
+
+   DataOutputStream getTargetLookupOutput() throws IOException {
+     return getOutput(targetLookupFile);
+   }
+
+   DataOutputStream getFeatureOutput() throws IOException {
+     return getOutput(featureFile);
+   }
+
+   /** Returns the alignment stream, or null when alignments are not packed. */
+   DataOutputStream getAlignmentOutput() throws IOException {
+     if (alignmentFile != null)
+       return getOutput(alignmentFile);
+     return null;
+   }
+
+   /**
+    * Creates {@code file} and opens a buffered stream onto it.
+    *
+    * @throws RuntimeException if the file already exists, so that an existing packed grammar
+    *         is never silently overwritten
+    */
+   private DataOutputStream getOutput(File file) throws IOException {
+     // createNewFile() returns false when the file is already there.
+     if (file.createNewFile()) {
+       return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
+     } else {
+       throw new RuntimeException("File already exists: " + file.getAbsolutePath());
+     }
+   }
+
+   /** Combined on-disk length of the source, target and feature files. */
+   long getSize() {
+     return sourceFile.length() + targetFile.length() + featureFile.length();
+   }
+
+   /** Orders tuples by descending size. */
+   @Override
+   public int compareTo(PackingFileTuple o) {
+     // Arguments swapped on purpose: the larger tuple sorts first.
+     return Long.compare(o.getSize(), getSize());
+   }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java
new file mode 100644
index 0000000..3cd4d0c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPackerCli.java
@@ -0,0 +1,156 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.kohsuke.args4j.spi.StringArrayOptionHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+ /**
+  * Command-line front end that packs one or more grammars with {@link GrammarPacker}.
+  * All grammars are packed in sequence and the vocabulary is written to every output
+  * directory once all of them have been packed.
+  */
+ public class GrammarPackerCli {
+
+   private static final Logger LOG = LoggerFactory.getLogger(GrammarPackerCli.class);
+
+   // Input grammars to be packed (with a joint vocabulary)
+   @Option(name = "--grammars", aliases = {"-g", "-i"}, handler = StringArrayOptionHandler.class, required = true, usage = "list of grammars to pack (jointly, i.e. they share the same vocabulary)")
+   private List<String> grammars = new ArrayList<>();
+
+   // Output directories, one per input grammar
+   @Option(name = "--outputs", aliases = {"-p", "-o"}, handler = StringArrayOptionHandler.class, required = true, usage = "output directories of packed grammars.")
+   private List<String> outputs = new ArrayList<>();
+
+   // Alignment files, one per input grammar (optional)
+   @Option(name = "--alignments", aliases = {"-a", "--fa"}, handler = StringArrayOptionHandler.class, required = false, usage = "alignment files")
+   private List<String> alignments_filenames = new ArrayList<>();
+
+   // Config filename
+   @Option(name = "--config_file", aliases = {"-c"}, required = false, usage = "(optional) packing configuration file")
+   private String config_filename;
+
+   // Feature-statistics dump files, one per input grammar (optional)
+   @Option(name = "--dump_files", aliases = {"-d"}, handler = StringArrayOptionHandler.class, usage = "(optional) dump feature stats to file")
+   private List<String> featuredump_filenames = new ArrayList<>();
+
+   @Option(name = "--ga", usage = "whether alignments are present in the grammar")
+   private boolean grammar_alignments = false;
+
+   @Option(name = "--slice_size", aliases = {"-s"}, required = false, usage = "approximate slice size in # of rules (default=1000000)")
+   private int slice_size = 1000000;
+
+
+   /**
+    * Validates the parsed options, then packs every grammar and writes the vocabulary
+    * to each output directory.
+    *
+    * @throws IOException if an input file is missing, an output directory already exists,
+    *         or a per-grammar option list has fewer entries than there are grammars
+    */
+   private void run() throws IOException {
+
+     final List<String> missingFilenames = selectByExistence(grammars, false);
+     if (!missingFilenames.isEmpty()) {
+       throw new IOException("Input grammar files not found: " + missingFilenames.toString());
+     }
+
+     if (config_filename != null && !new File(config_filename).exists()) {
+       throw new IOException("Config file not found: " + config_filename);
+     }
+
+     if (outputs.isEmpty()) {
+       // No explicit outputs: derive one directory name per grammar.
+       for (final String g : grammars) {
+         outputs.add(g + ".packed");
+       }
+     } else {
+       if (outputs.size() != grammars.size()) {
+         throw new IOException("Must provide an output directory for each grammar");
+       }
+       final List<String> existingOutputs = selectByExistence(outputs, true);
+       if (!existingOutputs.isEmpty()) {
+         throw new IOException("These output directories already exist (will not overwrite): " + existingOutputs.toString());
+       }
+     }
+
+     final List<String> missingAlignmentFiles = selectByExistence(alignments_filenames, false);
+     if (!missingAlignmentFiles.isEmpty()) {
+       throw new IOException("Alignment files not found: " + missingAlignmentFiles.toString());
+     }
+
+     // The per-grammar lists are indexed with the grammar index below; a list that is too
+     // short would otherwise fail with an IndexOutOfBoundsException part-way through.
+     // Surplus entries are ignored, as before.
+     if (!alignments_filenames.isEmpty() && alignments_filenames.size() < grammars.size()) {
+       throw new IOException("Must provide an alignment file for each grammar");
+     }
+     if (!featuredump_filenames.isEmpty() && featuredump_filenames.size() < grammars.size()) {
+       throw new IOException("Must provide a feature dump file for each grammar");
+     }
+
+     // create Packer instances for each grammar
+     final List<GrammarPacker> packers = new ArrayList<>(grammars.size());
+     for (int i = 0; i < grammars.size(); i++) {
+       LOG.info("Starting GrammarPacker for {}", grammars.get(i));
+       final String alignment_filename = alignments_filenames.isEmpty() ? null : alignments_filenames.get(i);
+       final String featuredump_filename = featuredump_filenames.isEmpty() ? null : featuredump_filenames.get(i);
+       final GrammarPacker packer = new GrammarPacker(
+           grammars.get(i),
+           config_filename,
+           outputs.get(i),
+           alignment_filename,
+           featuredump_filename,
+           grammar_alignments,
+           slice_size);
+       packers.add(packer);
+     }
+
+     // run all packers in sequence, accumulating vocabulary items
+     for (final GrammarPacker packer : packers) {
+       LOG.info("Starting GrammarPacker for {}", packer.getGrammar());
+       packer.pack();
+       LOG.info("PackedGrammar located at {}", packer.getOutputDirectory());
+     }
+
+     // for each packed grammar, overwrite the internally serialized vocabulary with the current global one.
+     for (final GrammarPacker packer : packers) {
+       LOG.info("Writing final common Vocabulary to {}", packer.getOutputDirectory());
+       packer.writeVocabulary();
+     }
+   }
+
+   /**
+    * Returns, in input order, the paths whose existence on disk equals {@code exists}.
+    */
+   private static List<String> selectByExistence(List<String> paths, boolean exists) {
+     final List<String> selected = new ArrayList<>(paths.size());
+     for (final String path : paths) {
+       if (new File(path).exists() == exists) {
+         selected.add(path);
+       }
+     }
+     return selected;
+   }
+
+   /**
+    * Parses the command line and runs the packer; prints usage and exits with status 1
+    * when the arguments cannot be parsed.
+    *
+    * @param args command-line arguments
+    * @throws IOException if validation or packing fails
+    */
+   public static void main(String[] args) throws IOException {
+     final GrammarPackerCli cli = new GrammarPackerCli();
+     final CmdLineParser parser = new CmdLineParser(cli);
+
+     try {
+       parser.parseArgument(args);
+       cli.run();
+     } catch (CmdLineException e) {
+       LOG.error(e.getMessage(), e);
+       parser.printUsage(System.err);
+       System.exit(1);
+     }
+   }
+
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java b/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java
new file mode 100644
index 0000000..2fd2b3f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/LabelPhrases.java
@@ -0,0 +1,111 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import java.io.IOException;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.corpus.syntax.ArraySyntaxTree;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Finds labeling for a set of phrases.
+ *
+ * @author Juri Ganitkevitch
+ */
+public class LabelPhrases {
+
+ private static final Logger LOG = LoggerFactory.getLogger(LabelPhrases.class);
+
+ /**
+ * Main method.
+ *
+ * @param args names of the two grammars to be compared
+ * @throws IOException if there is an error reading the input grammars
+ */
+ public static void main(String[] args) throws IOException {
+
+ if (args.length < 1 || args[0].equals("-h")) {
+ System.err.println("Usage: " + LabelPhrases.class.toString());
+ System.err.println(" -p phrase_file phrase-sentence file to process");
+ System.err.println();
+ System.exit(-1);
+ }
+
+ String phrase_file_name = null;
+
+ for (int i = 0; i < args.length; i++) {
+ if ("-p".equals(args[i])) phrase_file_name = args[++i];
+ }
+ if (phrase_file_name == null) {
+ LOG.error("a phrase file is required for operation");
+ System.exit(-1);
+ }
+
+ LineReader phrase_reader = new LineReader(phrase_file_name);
+
+ while (phrase_reader.ready()) {
+ String line = phrase_reader.readLine();
+
+ String[] fields = line.split("\\t");
+ if (fields.length != 3 || fields[2].equals("()")) {
+ System.err.println("[FAIL] Empty parse in line:\t" + line);
+ continue;
+ }
+
+ String[] phrase_strings = fields[0].split("\\s");
+ int[] phrase_ids = new int[phrase_strings.length];
+ for (int i = 0; i < phrase_strings.length; i++)
+ phrase_ids[i] = Vocabulary.id(phrase_strings[i]);
+
+ ArraySyntaxTree syntax = new ArraySyntaxTree(fields[2]);
+ int[] sentence_ids = syntax.getTerminals();
+
+ int match_start = -1;
+ int match_end = -1;
+ for (int i = 0; i < sentence_ids.length; i++) {
+ if (phrase_ids[0] == sentence_ids[i]) {
+ match_start = i;
+ int j = 0;
+ while (j < phrase_ids.length && phrase_ids[j] == sentence_ids[i + j]) {
+ j++;
+ }
+ if (j == phrase_ids.length) {
+ match_end = i + j;
+ break;
+ }
+ }
+ }
+
+ int label = syntax.getOneConstituent(match_start, match_end);
+ if (label == 0) label = syntax.getOneSingleConcatenation(match_start, match_end);
+ if (label == 0) label = syntax.getOneRightSideCCG(match_start, match_end);
+ if (label == 0) label = syntax.getOneLeftSideCCG(match_start, match_end);
+ if (label == 0) label = syntax.getOneDoubleConcatenation(match_start, match_end);
+ if (label == 0) {
+ System.err.println("[FAIL] No label found in line:\t" + line);
+ continue;
+ }
+
+ System.out.println(Vocabulary.word(label) + "\t" + line);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java b/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java
new file mode 100644
index 0000000..ecb2e6e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/TestSetFilter.java
@@ -0,0 +1,383 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestSetFilter {
+
+ private static final Logger LOG = LoggerFactory.getLogger(TestSetFilter.class);
+
+ private Filter filter = null;
+
+ // for caching of accepted rules
+ private String lastSourceSide;
+ private boolean acceptedLastSourceSide;
+
+ public int cached = 0;
+ public int RULE_LENGTH = 12;
+ public boolean verbose = false;
+ public boolean parallel = false;
+
+ private static final String DELIMITER = "|||";
+ private static final String DELIMITER_REGEX = " \\|\\|\\| ";
+ public static final String DELIM = String.format(" %s ", DELIMITER);
+ public static final Pattern P_DELIM = Pattern.compile(DELIMITER_REGEX);
+ private final String NT_REGEX = "\\[[^\\]]+?\\]";
+
+ /**
+ * Creates a filter with an empty source-side cache. No Filter implementation is selected
+ * here; the filter field stays null until setFilter is called.
+ */
+ public TestSetFilter() {
+ acceptedLastSourceSide = false;
+ lastSourceSide = null;
+ }
+
+ public String getFilterName() {
+ if (filter != null)
+ if (filter instanceof FastFilter)
+ return "fast";
+ else if (filter instanceof LooseFilter)
+ return "loose";
+ else
+ return "exact";
+ return "null";
+ }
+
+ /** Sets the verbose flag, which loadTestSentences and main consult before printing progress to STDERR. */
+ public void setVerbose(boolean value) {
+ verbose = value;
+ }
+
+ /** Sets the parallel flag, which main consults to flush after every rule and emit a blank line for dropped rules. */
+ public void setParallel(boolean value) {
+ parallel = value;
+ }
+
+ public void setFilter(String type) {
+ if (type.equals("fast"))
+ filter = new FastFilter();
+ else if (type.equals("exact"))
+ filter = new ExactFilter();
+ else if (type.equals("loose"))
+ filter = new LooseFilter();
+ else
+ throw new RuntimeException(String.format("Invalid filter type '%s'", type));
+ }
+
+ /** Sets RULE_LENGTH, which FastFilter.addSentence uses as the upper bound on n-gram order. */
+ public void setRuleLength(int value) {
+ RULE_LENGTH = value;
+ }
+
+ private void loadTestSentences(String filename) throws IOException {
+ int count = 0;
+
+ try {
+ for (String line: new LineReader(filename)) {
+ filter.addSentence(line);
+ count++;
+ }
+ } catch (FileNotFoundException e) {
+ LOG.error(e.getMessage(), e);
+ }
+
+ if (verbose)
+ System.err.println(String.format("Added %d sentences.\n", count));
+ }
+
+ /**
+  * Top-level filter entry point. Asks the active filter whether the given rule source side
+  * is permitted, remembering the answer for the most recent source side so that a run of
+  * identical queries is answered without consulting the filter again.
+  *
+  * @param sourceSide the source side of a rule
+  * @return true if the active filter permits the source side
+  */
+ public boolean inTestSet(String sourceSide) {
+   if (sourceSide.equals(lastSourceSide)) {
+     // Same source side as the previous call: reuse the remembered verdict.
+     cached++;
+   } else {
+     lastSourceSide = sourceSide;
+     acceptedLastSourceSide = filter.permits(sourceSide);
+   }
+   return acceptedLastSourceSide;
+ }
+
+ /**
+ * Determines whether a rule is an abstract rule. An abstract rule is one that has no terminals on
+ * its source side.
+ *
+ * If the rule is abstract, the rule's arity is returned. Otherwise, 0 is returned.
+ */
+ private boolean isAbstract(String source) {
+ int nonterminalCount = 0;
+ for (String t : source.split("\\s+")) {
+ if (!t.matches(NT_REGEX))
+ return false;
+ nonterminalCount++;
+ }
+ return nonterminalCount != 0;
+ }
+
+ /** Strategy deciding whether a rule's source side can apply to any sentence of the test set. */
+ private interface Filter {
+ /** Tell the filter about a sentence in the test set being filtered to. */
+ public void addSentence(String sentence);
+
+ /** Returns true if the filter permits the specified source side. */
+ public boolean permits(String sourceSide);
+ }
+
+ /**
+  * Filter backed by the set of all n-grams (up to RULE_LENGTH tokens) seen in the test
+  * sentences. A source side is permitted when every terminal chunk between nonterminals
+  * is one of those n-grams.
+  */
+ private class FastFilter implements Filter {
+   private Set<String> ngrams = null;
+
+   public FastFilter() {
+     ngrams = new HashSet<String>();
+   }
+
+   @Override
+   public boolean permits(String source) {
+     for (String rawChunk : source.split(NT_REGEX)) {
+       final String chunk = rawChunk.trim();
+       // Splitting on nonterminals can leave empty pieces; those carry no terminals to check.
+       if (chunk.isEmpty()) {
+         continue;
+       }
+       if (!ngrams.contains(chunk)) {
+         return false;
+       }
+     }
+     return true;
+   }
+
+   @Override
+   public void addSentence(String sentence) {
+     final String[] tokens = sentence.trim().split("\\s+");
+     final int maxOrder = Math.min(RULE_LENGTH, tokens.length);
+     for (int order = 1; order <= maxOrder; order++) {
+       final int lastStart = tokens.length - order;
+       for (int start = 0; start <= lastStart; start++) {
+         ngrams.add(createNGram(tokens, start, order));
+       }
+     }
+   }
+
+   /** Joins {@code order} tokens beginning at {@code start} with single spaces; "" if out of range. */
+   private String createNGram(String[] tokens, int start, int order) {
+     if (order < 1 || start + order > tokens.length) {
+       return "";
+     }
+     final StringBuilder ngram = new StringBuilder(tokens[start]);
+     for (int offset = 1; offset < order; offset++) {
+       ngram.append(" ").append(tokens[start + offset]);
+     }
+     return ngram.toString();
+   }
+ }
+
+ /**
+  * Filter that turns a source side into a permissive regular expression and searches every
+  * stored test sentence for it. Abstract rules are permitted even when nothing matches.
+  */
+ private class LooseFilter implements Filter {
+   List<String> testSentences = null;
+
+   public LooseFilter() {
+     testSentences = new ArrayList<String>();
+   }
+
+   @Override
+   public void addSentence(String source) {
+     testSentences.add(source);
+   }
+
+   @Override
+   public boolean permits(String source) {
+     final Pattern rulePattern = getPattern(source);
+     for (String candidate : testSentences) {
+       if (rulePattern.matcher(candidate).find()) {
+         return true;
+       }
+     }
+     return isAbstract(source);
+   }
+
+   /**
+    * Builds the search pattern: each nonterminal (with surrounding whitespace) becomes
+    * ".+" and each remaining whitespace run becomes ".*". Terminals are inserted into
+    * the expression as-is, without quoting.
+    */
+   protected Pattern getPattern(String source) {
+     final String nonterminalWithPadding = String.format("\\s*%s\\s*", NT_REGEX);
+     final String regex = source
+         .replaceAll(nonterminalWithPadding, ".+")
+         .replaceAll("\\s+", ".*");
+     return Pattern.compile(regex);
+   }
+ }
+
+ /**
+  * This class is the same as LooseFilter except with a tighter regex for matching rules.
+  * It also keeps a word-to-sentence index so that the regex is only tried on sentences
+  * that contain the rule's terminals.
+  */
+ private class ExactFilter implements Filter {
+   private FastFilter fastFilter = null;
+   private Map<String, Set<Integer>> sentencesByWord;
+   List<String> testSentences = null;
+
+   public ExactFilter() {
+     fastFilter = new FastFilter();
+     sentencesByWord = new HashMap<String, Set<Integer>>();
+     testSentences = new ArrayList<String>();
+   }
+
+   @Override
+   public void addSentence(String source) {
+     fastFilter.addSentence(source);
+     addSentenceToWordHash(source, testSentences.size());
+     testSentences.add(source);
+   }
+
+   /**
+    * Always permit abstract rules. Otherwise, query the fast filter, and if that passes, apply
+    * the exact pattern to each candidate sentence returned by the word index.
+    */
+   @Override
+   public boolean permits(String sourceSide) {
+     if (isAbstract(sourceSide)) {
+       return true;
+     }
+     if (!fastFilter.permits(sourceSide)) {
+       return false;
+     }
+
+     Pattern pattern = getPattern(sourceSide);
+     for (int i : getSentencesForRule(sourceSide)) {
+       if (pattern.matcher(testSentences.get(i)).find()) {
+         return true;
+       }
+     }
+     return false;
+   }
+
+   /**
+    * Builds the search pattern: the source side is quoted literally, each nonterminal is
+    * replaced by ".+", and the whole expression must be delimited by whitespace or the
+    * sentence boundaries.
+    */
+   protected Pattern getPattern(String source) {
+     String pattern = Pattern.quote(source);
+     // Close the literal quote around each nonterminal and splice in a wildcard.
+     pattern = pattern.replaceAll(NT_REGEX, "\\\\E.+\\\\Q");
+     // Drop the empty quoted sections the previous step may have produced.
+     pattern = pattern.replaceAll("\\\\Q\\\\E", "");
+     pattern = "(?:^|\\s)" + pattern + "(?:$|\\s)";
+     return Pattern.compile(pattern);
+   }
+
+   /*
+    * Map words to all the sentences they appear in.
+    */
+   private void addSentenceToWordHash(String sentence, int index) {
+     String[] tokens = sentence.split("\\s+");
+     for (String t : tokens) {
+       Set<Integer> indices = sentencesByWord.get(t);
+       if (indices == null) {
+         indices = new HashSet<Integer>();
+         sentencesByWord.put(t, indices);
+       }
+       indices.add(index);
+     }
+   }
+
+   /**
+    * Returns the indices of the sentences that contain every indexed terminal of the rule.
+    * Never returns null: when no terminal of the rule is present in the index, the index
+    * cannot narrow the search, so every sentence is returned as a candidate.
+    */
+   private Set<Integer> getSentencesForRule(String source) {
+     Set<Integer> sentences = null;
+     for (String token : source.split("\\s+")) {
+       if (token.matches(NT_REGEX) || !sentencesByWord.containsKey(token)) {
+         continue;
+       }
+       if (sentences == null) {
+         sentences = new HashSet<Integer>(sentencesByWord.get(token));
+       } else {
+         sentences.retainAll(sentencesByWord.get(token));
+       }
+     }
+
+     if (sentences == null) {
+       // Previously this returned null and the caller failed with a NullPointerException.
+       sentences = new HashSet<Integer>();
+       for (int i = 0; i < testSentences.size(); i++) {
+         sentences.add(i);
+       }
+     }
+     return sentences;
+   }
+ }
+
+ /**
+  * Command-line entry point. Reads grammar rules from the file given with {@code -g} (or
+  * from STDIN) and prints those whose source side is permitted by the selected filter
+  * for the given test sets.
+  *
+  * @param argv options followed by (or interleaved with) one or more test-set files
+  * @throws IOException if a test set or the grammar cannot be read
+  * @throws IllegalArgumentException if {@code -g} or {@code -n} is given without a value
+  */
+ public static void main(String[] argv) throws IOException {
+   // do some setup
+   if (argv.length < 1) {
+     System.err.println("usage: TestSetFilter [-v|-p|-f|-e|-l|-n N|-g grammar] test_set1 [test_set2 ...]");
+     System.err.println(" -g grammar file (can also be on STDIN)");
+     System.err.println(" -v verbose output");
+     System.err.println(" -p parallel compatibility");
+     System.err.println(" -f fast mode (default)");
+     System.err.println(" -e exact mode (slower)");
+     System.err.println(" -l loose mode");
+     System.err.println(" -n max n-gram to compare to (default 12)");
+     return;
+   }
+
+   String grammarFile = null;
+
+   TestSetFilter filter = new TestSetFilter();
+
+   // Test sets are only collected here and loaded once every option has been seen, so
+   // that the filter mode and n-gram length apply regardless of argument order.
+   List<String> testSetFiles = new ArrayList<String>();
+
+   for (int i = 0; i < argv.length; i++) {
+     String arg = argv[i];
+     if (arg.equals("-v")) {
+       filter.setVerbose(true);
+     } else if (arg.equals("-p")) {
+       filter.setParallel(true);
+     } else if (arg.equals("-f")) {
+       filter.setFilter("fast");
+     } else if (arg.equals("-e")) {
+       filter.setFilter("exact");
+     } else if (arg.equals("-l")) {
+       filter.setFilter("loose");
+     } else if (arg.equals("-g") || arg.equals("-n")) {
+       if (i + 1 >= argv.length) {
+         throw new IllegalArgumentException("Missing value for option " + arg);
+       }
+       String value = argv[++i];
+       if (arg.equals("-g")) {
+         grammarFile = value;
+       } else {
+         filter.setRuleLength(Integer.parseInt(value));
+       }
+     } else {
+       testSetFiles.add(arg);
+     }
+   }
+
+   // Fast mode is the documented default when no mode flag was given.
+   if (filter.filter == null) {
+     filter.setFilter("fast");
+   }
+   for (String testSetFile : testSetFiles) {
+     filter.loadTestSentences(testSetFile);
+   }
+
+   int rulesIn = 0;
+   int rulesOut = 0;
+   if (filter.verbose) {
+     System.err.println(String.format("Filtering rules with the %s filter...", filter.getFilterName()));
+   }
+   LineReader reader = (grammarFile != null)
+       ? new LineReader(grammarFile, filter.verbose)
+       : new LineReader(System.in);
+   for (String rule : reader) {
+     rulesIn++;
+
+     String[] parts = P_DELIM.split(rule);
+     if (parts.length >= 4) {
+       // the source is the second field for thrax grammars, first field for phrasal ones
+       String source = rule.startsWith("[") ? parts[1].trim() : parts[0].trim();
+       if (filter.inTestSet(source)) {
+         System.out.println(rule);
+         if (filter.parallel) {
+           System.out.flush();
+         }
+         rulesOut++;
+       } else if (filter.parallel) {
+         // In parallel mode a dropped rule still yields one (blank) output line.
+         System.out.println("");
+         System.out.flush();
+       }
+     }
+   }
+   if (filter.verbose) {
+     System.err.println("[INFO] Total rules read: " + rulesIn);
+     System.err.println("[INFO] Rules kept: " + rulesOut);
+     System.err.println("[INFO] Rules dropped: " + (rulesIn - rulesOut));
+     System.err.println("[INFO] cached queries: " + filter.cached);
+   }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java b/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java
new file mode 100644
index 0000000..4c536ce
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/Orientation.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui;
+
+ /**
+ * A layout direction: either horizontal or vertical.
+ */
+ public enum Orientation {
+ HORIZONTAL, VERTICAL
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java b/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java
new file mode 100644
index 0000000..cccdd80
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/StartupWindow.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui;
+
+import java.awt.BorderLayout;
+import java.awt.Color;
+import java.awt.Font;
+import java.awt.GraphicsEnvironment;
+import java.awt.Image;
+import java.awt.Point;
+
+import javax.swing.BorderFactory;
+import javax.swing.ImageIcon;
+import javax.swing.JLabel;
+import javax.swing.JPanel;
+import javax.swing.JWindow;
+
+/**
+ * Startup window for Joshua programs.
+ *
+ * @author Lane Schwartz
+ * @author Aaron Phillips
+ */
+public class StartupWindow extends JWindow {
+
+ /** Serialization identifier. */
+ private static final long serialVersionUID = 1L;
+
+ /**
+ * Constructs a splash screen.
+ *
+ * @param title Title to be displayed
+ */
+ public StartupWindow(String title) {
+ this(title, "Joshua Developers", "2010", Color.BLACK, 5);
+ }
+
+ public StartupWindow(String title, String author, String year, Image image, Color borderColor,
+ int borderWidth) {
+ JPanel content = (JPanel) getContentPane();
+ content.setBackground(Color.WHITE);
+
+ int width = 250;
+ int height = 100;
+
+ Point center = GraphicsEnvironment.getLocalGraphicsEnvironment().getCenterPoint();
+ setBounds(center.x - width / 2, center.y - height / 2, width, height);
+
+ JLabel titleLabel = new JLabel(title, JLabel.CENTER);
+ titleLabel.setFont(new Font("Sans-Serif", Font.BOLD, 24));
+ content.add(titleLabel, BorderLayout.NORTH);
+
+ JLabel copyright = new JLabel("\u24D2 " + year + " - " + author, JLabel.CENTER);
+ copyright.setFont(new Font("Sans-Serif", Font.PLAIN, 8));
+ content.add(copyright, BorderLayout.SOUTH);
+
+ if (image != null) {
+ content.add(new JLabel(new ImageIcon(image)));
+ }
+
+ content.setBorder(BorderFactory.createLineBorder(borderColor, borderWidth));
+
+ // Display it
+ setVisible(true);
+ }
+
+ public StartupWindow(String title, String author, String year, Color borderColor, int borderWidth) {
+ this(title, author, year, null, borderColor, borderWidth);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java b/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java
new file mode 100644
index 0000000..1d69516
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides classes for visualizing parts of the translation process.
+ */
+package org.apache.joshua.ui;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java
new file mode 100644
index 0000000..f09a40a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTree.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.util.Arrays;
+import java.util.List;
+import java.util.Collections;
+
+import org.apache.joshua.ui.tree_visualizer.tree.Tree;
+
+import edu.uci.ics.jung.graph.DirectedOrderedSparseMultigraph;
+import edu.uci.ics.jung.graph.util.EdgeType;
+import edu.uci.ics.jung.graph.util.Pair;
+
+ /**
+ * Graph holding two trees built from one Tree: one rooted at root and one rooted at
+ * sourceRoot. Both roots carry the label of the input tree's root.
+ */
+ public class DerivationTree extends DirectedOrderedSparseMultigraph<Node, DerivationTreeEdge> {
+ /**
+ * Serialization identifier.
+ */
+ private static final long serialVersionUID = 2914449263979566324L;
+
+ public final Node root;
+ public final Node sourceRoot;
+
+ /**
+ * Builds both trees from t. The source string is split on whitespace and its words are
+ * used to label the leaves inserted into the tree under sourceRoot.
+ */
+ public DerivationTree(Tree t, String source) {
+ final Tree.Node treeRoot = t.root();
+ final String rootLabel = treeRoot.label();
+ // NOTE(review): the boolean passed to Node presumably marks a source-side node - confirm against Node.
+ root = new Node(rootLabel, false);
+ sourceRoot = new Node(rootLabel, true);
+ addVertex(root);
+ addVertex(sourceRoot);
+ addSubtreeRootedAt(root, treeRoot);
+ final String[] sourceWords = source.split("\\s+");
+ addSourceSubtreeRootedAt(sourceRoot, treeRoot, 0, sourceWords.length, sourceWords);
+ }
+
+ /**
+ * Recursively mirrors the children of tn below n: one vertex and one directed edge per child.
+ */
+ private void addSubtreeRootedAt(Node n, Tree.Node tn) {
+ for (Tree.Node child : tn.children()) {
+ Node childNode = new Node(child.label(), false);
+ addVertex(childNode);
+ addEdge(new DerivationTreeEdge(false), new Pair<Node>(n, childNode), EdgeType.DIRECTED);
+ addSubtreeRootedAt(childNode, child);
+ }
+ }
+
+ /**
+ * Recursively mirrors the non-leaf children of tn below n, visiting them in the order
+ * given by Tree.NodeSourceStartComparator. Source words in [firstIndex, lastIndex) that
+ * are not covered by any non-leaf child's source span are attached to n as leaves.
+ */
+ private void addSourceSubtreeRootedAt(Node n, Tree.Node tn, int firstIndex, int lastIndex,
+ String[] sourceWords) {
+ int nextUncoveredIndex = firstIndex;
+ Tree.NodeSourceStartComparator cmp = new Tree.NodeSourceStartComparator();
+ List<Tree.Node> children = tn.children();
+ // NOTE(review): this sorts the list returned by children() in place; if that is the
+ // tree's own list, the tree is reordered as a side effect - confirm.
+ Collections.sort(children, cmp);
+ for (Tree.Node child : children) {
+ if (child.isLeaf()) {
+ continue;
+ }
+ int sourceStartIndex = child.sourceStartIndex();
+ int sourceEndIndex = child.sourceEndIndex();
+ // Words between the previous child's span and this one belong directly to n.
+ if (sourceStartIndex > nextUncoveredIndex) {
+ insertSourceLeaf(n, sourceWords, nextUncoveredIndex, sourceStartIndex);
+ }
+ Node childNode = new Node(child.label(), true);
+ addEdge(new DerivationTreeEdge(true), new Pair<Node>(n, childNode), EdgeType.DIRECTED);
+ nextUncoveredIndex = sourceEndIndex;
+ addSourceSubtreeRootedAt(childNode, child, sourceStartIndex, sourceEndIndex, sourceWords);
+ }
+ // Words after the last child's span, up to lastIndex, also belong directly to n.
+ if (nextUncoveredIndex < lastIndex) {
+ insertSourceLeaf(n, sourceWords, nextUncoveredIndex, lastIndex);
+ }
+ }
+
+ /**
+ * Attaches a leaf below n whose label is words[start..end) joined by single spaces.
+ * Both call sites only invoke this with start < end.
+ */
+ private void insertSourceLeaf(Node n, String[] words, int start, int end) {
+ final String[] leafWords = Arrays.copyOfRange(words, start, end);
+ String label = leafWords[0];
+ for (int i = 1; i < leafWords.length; i++) {
+ label += " " + leafWords[i];
+ }
+ Node childNode = new Node(label, true);
+ addEdge(new DerivationTreeEdge(true), new Pair<Node>(n, childNode), EdgeType.DIRECTED);
+ }
+
+ /**
+ * Sets the isHighlighted flag of n and, recursively, of every successor of n to b.
+ */
+ public void setSubtreeHighlight(Node n, boolean b) {
+ n.isHighlighted = b;
+ for (Node s : getSuccessors(n)) {
+ setSubtreeHighlight(s, b);
+ }
+ return;
+ }
+ }
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java
new file mode 100644
index 0000000..33b6b22
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeEdge.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+/**
+ * Edge type of the derivation tree graph. An edge carries a single boolean flag that is fixed
+ * when the edge is created.
+ */
+public class DerivationTreeEdge {
+  /** The flag given to the constructor; it never changes afterwards. */
+  public final boolean pointsToSource;
+
+  /**
+   * Creates an edge.
+   *
+   * @param pts the value stored in {@link #pointsToSource}
+   */
+  public DerivationTreeEdge(boolean pts) {
+    this.pointsToSource = pts;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java
new file mode 100644
index 0000000..3e4010f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationTreeTransformer.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.awt.Dimension;
+import java.awt.geom.Point2D;
+
+import org.apache.commons.collections15.Transformer;
+
+import edu.uci.ics.jung.algorithms.layout.TreeLayout;
+import edu.uci.ics.jung.graph.DelegateForest;
+
+/**
+ * Maps every node of a {@link DerivationTree} to the point at which it is drawn.
+ *
+ * <p>Horizontal positions come from a JUNG {@link TreeLayout} built over the tree. Vertical
+ * positions are computed here from the number of edges between a node and its deepest descendant
+ * leaf: nodes flagged <code>isSource</code> and all other nodes are placed on opposite sides of a
+ * common baseline.
+ */
+public class DerivationTreeTransformer implements Transformer<Node, Point2D> {
+  private final TreeLayout<Node, DerivationTreeEdge> treeLayout;
+  private final DerivationTree graph;
+  private final Node root;
+  private final Node sourceRoot;
+
+  /** Whether {@link #transform(Node)} adds {@link #anchorPoint} to the computed position. */
+  private final boolean isAnchored;
+  private Point2D anchorPoint;
+
+  /** Vertical distance between two consecutive levels of the drawing. */
+  private final double yDist;
+
+  /**
+   * Builds a transformer for the given tree.
+   *
+   * @param t the tree to lay out; its <code>root</code> and <code>sourceRoot</code> are registered
+   *        as roots of the forest handed to the layout
+   * @param d the drawing area; its height determines the vertical spacing and its width divided
+   *        by the number of leaves determines the horizontal spacing
+   * @param isAnchored whether positions are shifted by the anchor offset (initially zero, see
+   *        {@link #setAnchorPoint})
+   */
+  public DerivationTreeTransformer(DerivationTree t, Dimension d, boolean isAnchored) {
+    this.isAnchored = isAnchored;
+    anchorPoint = new Point2D.Double(0, 0);
+    graph = t;
+    DelegateForest<Node, DerivationTreeEdge> del = new DelegateForest<Node, DerivationTreeEdge>(t);
+    del.setRoot(t.root);
+    del.setRoot(t.sourceRoot);
+    root = t.root;
+    sourceRoot = t.sourceRoot;
+    yDist = d.getHeight() / (2 * (1 + distanceToLeaf(root)));
+
+    int leafCount = 0;
+    for (Node n : t.getVertices()) {
+      if (t.outDegree(n) == 0) {
+        leafCount++;
+      }
+    }
+    double xDist = d.getWidth() / leafCount;
+    // A zero-sized or very narrow drawing area (or a graph without leaves) rounds to a spacing
+    // below 1, which TreeLayout is understood to reject with an IllegalArgumentException. Clamp
+    // to the smallest accepted spacing so construction still succeeds in that case.
+    int horizontalSpacing = Math.max(1, (int) Math.round(xDist));
+
+    treeLayout = new TreeLayout<Node, DerivationTreeEdge>(del, horizontalSpacing);
+  }
+
+  /**
+   * Returns the drawing position of a node.
+   *
+   * @param n the node to place
+   * @return a new point holding the node's coordinates, including the anchor offset when this
+   *         transformer was created as anchored
+   */
+  @Override
+  public Point2D transform(Node n) {
+    double x;
+    double y;
+    Point2D laidOut = treeLayout.transform(n);
+    if (n.isSource) {
+      // Shift source-side nodes horizontally so that the source root lines up with the root.
+      x = laidOut.getX() - treeLayout.transform(sourceRoot).getX()
+          + treeLayout.transform(root).getX();
+      y = yDist * (distanceToLeaf(n) + 1);
+    } else {
+      x = laidOut.getX();
+      y = yDist * (-1) * distanceToLeaf(n);
+    }
+    if (isAnchored) {
+      x += anchorPoint.getX();
+      y += anchorPoint.getY();
+    }
+    // The same vertical shift is applied to every node, whichever branch was taken above.
+    return new Point2D.Double(x, y + yDist * (1 + distanceToLeaf(root)));
+  }
+
+  /**
+   * Counts the edges on the longest successor path that starts at <code>n</code>.
+   *
+   * @param n the node to start from
+   * @return 0 when <code>n</code> has no successors, otherwise one more than the largest value
+   *         among its successors
+   */
+  private int distanceToLeaf(Node n) {
+    int deepestSuccessor = -1;
+    for (Node successor : graph.getSuccessors(n)) {
+      deepestSuccessor = Math.max(deepestSuccessor, distanceToLeaf(successor));
+    }
+    return deepestSuccessor + 1;
+  }
+
+  /**
+   * Returns the size of the drawing.
+   *
+   * @return a dimension whose height is twice the vertical spacing times one more than the root's
+   *         distance to a leaf, and whose width is twice the root's x coordinate in the layout
+   */
+  public Dimension getSize() {
+    int height = (int) Math.round(2 * yDist * (1 + distanceToLeaf(root)));
+    int width = (int) Math.round(2 * treeLayout.transform(root).getX());
+    return new Dimension(width, height);
+  }
+
+  /**
+   * Returns the current position of the node that the given anchor type refers to.
+   *
+   * @param type which node to use as the anchor
+   * @return the position of the root, or of the leaf reached by repeatedly following the first
+   *         successor from the root; <code>(0, 0)</code> for any other type
+   */
+  public Point2D getAnchorPosition(DerivationViewer.AnchorType type) {
+    switch (type) {
+      case ANCHOR_ROOT:
+        return transform(root);
+      case ANCHOR_LEFTMOST_LEAF:
+        Node n = root;
+        while (graph.getSuccessorCount(n) != 0) {
+          n = graph.getSuccessors(n).iterator().next();
+        }
+        return transform(n);
+      default:
+        return new Point2D.Double(0, 0);
+    }
+  }
+
+  /**
+   * Sets the anchor offset to the difference between <code>viewerAnchor</code> and the current
+   * position of the anchor node.
+   *
+   * @param type which node to use as the anchor
+   * @param viewerAnchor the reference point the anchor node's current position is subtracted from
+   */
+  public void setAnchorPoint(DerivationViewer.AnchorType type, Point2D viewerAnchor) {
+    Point2D oldAnchor = getAnchorPosition(type);
+    double x = viewerAnchor.getX() - oldAnchor.getX();
+    double y = viewerAnchor.getY() - oldAnchor.getY();
+    anchorPoint = new Point2D.Double(x, y);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java
new file mode 100644
index 0000000..8c6151d
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewer.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.awt.BasicStroke;
+import java.awt.Color;
+import java.awt.Dimension;
+import java.awt.Paint;
+import java.awt.Shape;
+import java.awt.Stroke;
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+
+import javax.swing.JLabel;
+
+import org.apache.commons.collections15.Transformer;
+
+import edu.uci.ics.jung.algorithms.layout.CircleLayout;
+import edu.uci.ics.jung.algorithms.layout.StaticLayout;
+import edu.uci.ics.jung.visualization.VisualizationViewer;
+import edu.uci.ics.jung.visualization.control.DefaultModalGraphMouse;
+import edu.uci.ics.jung.visualization.control.LayoutScalingControl;
+import edu.uci.ics.jung.visualization.control.ModalGraphMouse;
+import edu.uci.ics.jung.visualization.decorators.ToStringLabeller;
+import edu.uci.ics.jung.visualization.renderers.Renderer.VertexLabel.Position;
+
+/**
+ * A JUNG visualization component that draws a {@link DerivationTree}, placing its nodes with a
+ * {@link DerivationTreeTransformer}.
+ */
+@SuppressWarnings("serial")
+public class DerivationViewer extends VisualizationViewer<Node, DerivationTreeEdge> {
+  public static final int DEFAULT_HEIGHT = 500;
+  public static final int DEFAULT_WIDTH = 500;
+
+  /** Fill color of nodes flagged <code>isSource</code>. */
+  public static final Color SRC = Color.WHITE;
+
+  /** Fill color of highlighted nodes; takes precedence over the other fill colors. */
+  public static final Color HIGHLIGHT = Color.pink;
+
+  /** Dashed stroke used for edges whose <code>pointsToSource</code> flag is set. */
+  private static final Stroke DASHED_STROKE = new BasicStroke(1.0f,
+      BasicStroke.CAP_BUTT,
+      BasicStroke.JOIN_MITER,
+      10.0f,
+      new float[] {10.0f},
+      0.0f);
+
+  /** Solid stroke used for all other edges. */
+  private static final Stroke SOLID_STROKE = new BasicStroke(1.0f);
+
+  /** The node whose position is used as the anchor for the drawing. */
+  public enum AnchorType {
+    ANCHOR_ROOT, ANCHOR_LEFTMOST_LEAF
+  }
+
+  /** Fill color of nodes that are neither highlighted nor flagged <code>isSource</code>. */
+  private Color targetColor;
+
+  private AnchorType anchorStyle;
+
+  /** Anchor position of the tree given to the constructor; reused by {@link #setGraph}. */
+  private Point2D anchorPoint;
+
+  /**
+   * Creates a viewer showing the given tree.
+   *
+   * @param g the tree to display
+   * @param d the drawing area used to compute the spacing between nodes
+   * @param targetColor fill color for nodes that are neither highlighted nor source-side
+   * @param anchor which node's position is recorded as the anchor point
+   */
+  public DerivationViewer(DerivationTree g, Dimension d, Color targetColor, AnchorType anchor) {
+    // The superclass needs some layout up front; it is replaced by the static layout below.
+    super(new CircleLayout<Node, DerivationTreeEdge>(g));
+    anchorStyle = anchor;
+    DerivationTreeTransformer dtt = new DerivationTreeTransformer(g, d, false);
+    StaticLayout<Node, DerivationTreeEdge> derivationLayout =
+        new StaticLayout<Node, DerivationTreeEdge>(g, dtt);
+    setGraphLayout(derivationLayout);
+    scaleToLayout(new LayoutScalingControl());
+    // Dimension takes (width, height). The two defaults are equal today, so the resulting size
+    // is the same as before, but the arguments are now passed in the documented order.
+    setPreferredSize(new Dimension(DEFAULT_WIDTH, DEFAULT_HEIGHT));
+    getRenderContext().setVertexLabelTransformer(new ToStringLabeller<Node>());
+
+    DefaultModalGraphMouse<Node, DerivationTreeEdge> graphMouse =
+        new DefaultModalGraphMouse<Node, DerivationTreeEdge>();
+    graphMouse.setMode(ModalGraphMouse.Mode.TRANSFORMING);
+    setGraphMouse(graphMouse);
+    addKeyListener(graphMouse.getModeKeyListener());
+
+    getRenderContext().setVertexFillPaintTransformer(vertexPaint);
+    getRenderContext().setEdgeStrokeTransformer(EDGE_STROKE);
+    getRenderContext().setVertexShapeTransformer(NODE_SHAPE);
+    getRenderer().getVertexLabelRenderer().setPosition(Position.CNTR);
+
+    this.targetColor = targetColor;
+    anchorPoint = dtt.getAnchorPosition(anchorStyle);
+  }
+
+  /**
+   * Replaces the displayed tree. The new tree is laid out for the viewer's current size and
+   * anchored at the anchor point recorded by the constructor.
+   *
+   * @param tree the tree to display from now on
+   */
+  public void setGraph(DerivationTree tree) {
+    DerivationTreeTransformer dtt = new DerivationTreeTransformer(tree, getSize(), true);
+    dtt.setAnchorPoint(anchorStyle, anchorPoint);
+    setGraphLayout(new StaticLayout<Node, DerivationTreeEdge>(tree, dtt));
+  }
+
+  /** Chooses a node's fill: highlight first, then source color, otherwise the target color. */
+  private final Transformer<Node, Paint> vertexPaint = new Transformer<Node, Paint>() {
+    @Override
+    public Paint transform(Node n) {
+      if (n.isHighlighted) {
+        return HIGHLIGHT;
+      }
+      return n.isSource ? SRC : targetColor;
+    }
+  };
+
+  /** Chooses an edge's stroke: dashed when <code>pointsToSource</code> is set, solid otherwise. */
+  private static final Transformer<DerivationTreeEdge, Stroke> EDGE_STROKE =
+      new Transformer<DerivationTreeEdge, Stroke>() {
+        @Override
+        public Stroke transform(DerivationTreeEdge e) {
+          return e.pointsToSource ? DASHED_STROKE : SOLID_STROKE;
+        }
+      };
+
+  /** Sizes a node's rectangle from the rendered width of its label in the default label font. */
+  private static final Transformer<Node, Shape> NODE_SHAPE = new Transformer<Node, Shape>() {
+    @Override
+    public Shape transform(Node n) {
+      JLabel x = new JLabel();
+      double len = x.getFontMetrics(x.getFont()).stringWidth(n.toString());
+      double margin = 5.0;
+      // NOTE(review): the x offset uses one margin while the width uses two, so the box is not
+      // exactly centred on the node; kept as is to leave the rendering unchanged.
+      return new Rectangle2D.Double((len + margin) / (-2), 0, len + 2 * margin, 20);
+    }
+  };
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java
new file mode 100644
index 0000000..d6e7a35
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/DerivationViewerApplet.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+import java.awt.Color;
+
+import javax.swing.JApplet;
+
+import org.apache.joshua.ui.tree_visualizer.tree.Tree;
+
+/**
+ * An applet that shows a single derivation tree. Its only content is a {@link DerivationViewer}
+ * added directly to the applet.
+ *
+ * @author Jonathan Weese
+ */
+@SuppressWarnings("serial")
+public class DerivationViewerApplet extends JApplet {
+  /**
+   * Reads the applet parameters <code>sourceSentence</code> and <code>derivationTree</code>,
+   * builds the derivation tree from them and adds a viewer for it, sized to the applet, with a
+   * red target color and anchored at the root.
+   */
+  @Override
+  public void init() {
+    final String sourceSentence = getParameter("sourceSentence");
+    final Tree parsed = new Tree(getParameter("derivationTree"));
+    final DerivationTree derivationTree = new DerivationTree(parsed, sourceSentence);
+
+    final DerivationViewer viewer = new DerivationViewer(
+        derivationTree,
+        getSize(),
+        Color.red,
+        DerivationViewer.AnchorType.ANCHOR_ROOT);
+    add(viewer);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java
new file mode 100644
index 0000000..2ffeb06
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/ui/tree_visualizer/Node.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.ui.tree_visualizer;
+
+/**
+ * A vertex of a derivation tree. The derivation tree graph is parameterized by this class and by
+ * <code>DerivationTreeEdge</code>. A <code>Node</code> stands either for a non-terminal symbol or
+ * for one or more terminal symbols of the derivation.
+ */
+public class Node {
+  /**
+   * The text shown on the node: the name of the symbol for a non-terminal, or the terminal
+   * symbols joined with spaces otherwise.
+   */
+  public final String label;
+
+  /**
+   * Whether this node belongs to the source-side rather than the target-side derivation tree.
+   */
+  public final boolean isSource;
+
+  /**
+   * Tells the renderer whether this vertex is currently highlighted. Starts out
+   * <code>false</code>.
+   */
+  public boolean isHighlighted = false;
+
+  /**
+   * Creates a node that is not highlighted.
+   *
+   * @param label a <code>String</code> holding the symbol or symbols shown at this node
+   * @param isSource whether this is a source-side node
+   */
+  public Node(String label, boolean isSource) {
+    this.isSource = isSource;
+    this.label = label;
+  }
+
+  /**
+   * Returns the node's label, unchanged.
+   */
+  @Override
+  public String toString() {
+    return this.label;
+  }
+}