You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:19 UTC

[03/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it possible to use Maven to build Joshua

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/server/ServerThread.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/server/ServerThread.java b/src/main/java/org/apache/joshua/server/ServerThread.java
new file mode 100644
index 0000000..ac0390b
--- /dev/null
+++ b/src/main/java/org/apache/joshua/server/ServerThread.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.server;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.StringReader;
+import java.net.Socket;
+import java.net.SocketException;
+import java.net.URLDecoder;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.io.TranslationRequestStream;
+
+/**
+ * This class handles a concurrent request for translations from a newly opened socket.
+ */
+public class ServerThread extends Thread implements HttpHandler {
+  private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
+  
+  private final JoshuaConfiguration joshuaConfiguration;
+  private Socket socket = null;
+  private final Decoder decoder;
+
+  /**
+   * Creates a new TcpServerThread that can run a set of translations.
+   * 
+   * @param socket the socket representing the input/output streams
+   * @param decoder the configured decoder that handles performing translations
+   */
+  public ServerThread(Socket socket, Decoder decoder, JoshuaConfiguration joshuaConfiguration) {
+    this.joshuaConfiguration = joshuaConfiguration;
+    this.socket = socket;
+    this.decoder = decoder;
+  }
+
+  /**
+   * Reads the input from the socket, submits the input to the decoder, transforms the resulting
+   * translations into the required output format, writes out the formatted output, then closes the
+   * socket.
+   */
+  @Override
+  public void run() {
+
+    try {
+      BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), FILE_ENCODING));
+
+      TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+      try {
+        decoder.decodeAll(request, socket.getOutputStream());
+
+      } catch (SocketException e) {
+        System.err.println("* WARNING: Socket interrupted");
+        request.shutdown();
+        return;
+      }
+      reader.close();
+      socket.close();
+    } catch (IOException e) {
+      return;
+    }
+  }
+  
+  public HashMap<String, String> queryToMap(String query){
+    HashMap<String, String> result = new HashMap<String, String>();
+    for (String param : query.split("&")) {
+        String pair[] = param.split("=");
+        if (pair.length > 1) {
+            result.put(pair[0], pair[1]);
+        } else {
+            result.put(pair[0], "");
+        }
+    }
+    return result;
+  } 
+
+  private class HttpWriter extends OutputStream {
+
+    private HttpExchange client = null;
+    private OutputStream out = null;
+    
+    public HttpWriter(HttpExchange client) {
+      this.client = client;
+    }
+    
+    @Override
+    public void write(byte[] response) throws IOException {
+      client.sendResponseHeaders(200, response.length);
+      out = client.getResponseBody();
+      out.write(response);
+      out.close();
+    }
+
+    @Override
+    public void write(int b) throws IOException {
+      out.write(b);
+    }
+  }
+      
+      
+  @Override
+  public void handle(HttpExchange client) throws IOException {
+
+    HashMap<String, String> params = queryToMap(URLDecoder.decode(client.getRequestURI().getQuery(), "UTF-8"));
+    String query = params.get("q");
+    
+    BufferedReader reader = new BufferedReader(new StringReader(query));
+    TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+    
+    decoder.decodeAll(request, new HttpWriter(client));
+    reader.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/server/TcpServer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/server/TcpServer.java b/src/main/java/org/apache/joshua/server/TcpServer.java
new file mode 100644
index 0000000..2b63e72
--- /dev/null
+++ b/src/main/java/org/apache/joshua/server/TcpServer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.server;
+
+import java.net.*;
+import java.io.*;
+
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+
+/**
+ * TCP/IP server. Accepts newline-separated input sentences written to the socket, translates them
+ * all, and writes the resulting translations back out to the socket.
+ */
+public class TcpServer {
+  private final JoshuaConfiguration joshuaConfiguration;
+  private Decoder decoder;
+  private int port;
+
+  public TcpServer(Decoder decoder, int port,JoshuaConfiguration joshuaConfiguration) {
+    this.joshuaConfiguration = joshuaConfiguration;
+    this.decoder = decoder;
+    this.port = port;
+  }
+  
+  /**
+   * Listens on a port for new socket connections. Concurrently handles multiple socket connections.
+   * 
+   * @param args configuration options
+   * @throws IOException
+   */
+  public void start() {
+
+    try {
+      ServerSocket serverSocket = new ServerSocket(joshuaConfiguration.server_port);
+      Decoder.LOG(1, String.format("** TCP Server running and listening on port %d.", port));  
+
+      boolean listening = true;
+      while (listening)
+        new ServerThread(serverSocket.accept(), decoder, joshuaConfiguration).start();
+
+      serverSocket.close();
+
+    } catch (IOException e) {
+      System.err.println(String.format("Could not listen on port: %d.", joshuaConfiguration.server_port));
+      System.exit(-1);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java b/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
new file mode 100644
index 0000000..37480d7
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+
+
+/**
+ * A subsampler which takes in word-alignments as well as the F and E files. To remove redundant
+ * code, this class uses callback techniques in order to "override" the superclass methods.
+ * 
+ * @see joshua.subsample.Subsampler
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class AlignedSubsampler extends Subsampler {
+
+  public AlignedSubsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+    super(testFiles, maxN, targetCount);
+  }
+
+
+  /**
+   * @param filelist list of source files to subsample from
+   * @param targetFtoERatio goal for ratio of output F length to output E length
+   * @param extf extension of F files
+   * @param exte extension of E files
+   * @param exta extension of alignment files
+   * @param fpath path to source F files
+   * @param epath path to source E files
+   * @param apath path to source alignment files
+   * @param output basename for output files (will append extensions)
+   */
+  public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+      String exta, String fpath, String epath, String apath, String output) throws IOException {
+    this.subsample(filelist, targetFtoERatio, new PhraseWriter(new BufferedWriter(
+        new OutputStreamWriter(new FileOutputStream(output + "." + extf), "UTF8")),
+        new BufferedWriter(
+            new OutputStreamWriter(new FileOutputStream(output + "." + exte), "UTF8")),
+        new BufferedWriter(
+            new OutputStreamWriter(new FileOutputStream(output + "." + exta), "UTF8"))),
+        new BiCorpusFactory(fpath, epath, apath, extf, exte, exta) { /* Local class definition */
+          public BiCorpus fromFiles(String f) throws IOException {
+            return this.alignedFromFiles(f);
+          }
+        });
+  }
+
+
+  @SuppressWarnings("static-access")
+  public static void main(String[] args) {
+    new SubsamplerCLI() { /* Local class definition */
+
+      // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+      protected final Option oa = OptionBuilder.withArgName("lang").hasArg()
+          .withDescription("Word alignment extension").isRequired().create("a");
+
+      // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+      protected final Option oapath = OptionBuilder.withArgName("path").hasArg()
+          .withDescription("Directory containing word alignment files").create("apath");
+
+      public Options getCliOptions() {
+        return super.getCliOptions().addOption(oa).addOption(oapath);
+      }
+
+      public String getClassName() {
+        return AlignedSubsampler.class.getName();
+      }
+
+      public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+          throws IOException {
+        new AlignedSubsampler(testFiles, maxN, targetCount).subsample(ot.getValue(), ratio,
+            of.getValue(), oe.getValue(), oa.getValue(), ofpath.getValue(), oepath.getValue(),
+            oapath.getValue(), ooutput.getValue());
+      }
+
+    }.runMain(args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/Alignment.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/Alignment.java b/src/main/java/org/apache/joshua/subsample/Alignment.java
new file mode 100644
index 0000000..9033a3e
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/Alignment.java
@@ -0,0 +1,84 @@
+/*
+ * This file is based on the edu.umd.clip.mt.Alignment class from the University of Maryland's
+ * umd-hadoop-mt-0.01 project. That project is released under the terms of the Apache License 2.0,
+ * but with special permission for the Joshua Machine Translation System to release modifications
+ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible
+ * with Apache License 2.0
+ */
+package joshua.subsample;
+
+
+/**
+ * A set of word alignments between an F phrase and an E phrase. The implementation uses a
+ * two-dimensional bit vector, though for our purposes we could just keep the original string around
+ * (which would save lots of time parsing and reconstructing the string).
+ * 
+ * @see joshua.corpus.alignment.Alignments
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class Alignment {
+  private short eLength;
+  private short fLength;
+  private M2 aligned;
+
+  public Alignment(short fLength, short eLength, String alignments) {
+    this.eLength = eLength;
+    this.fLength = fLength;
+    this.aligned = new M2(fLength, eLength);
+
+    if (alignments == null || alignments.length() == 0) {
+      return;
+    }
+    String[] als = alignments.split("\\s+"); // TODO: joshua.util.Regex
+    for (String al : als) {
+      String[] pair = al.split("-");
+      if (pair.length != 2)
+        throw new IllegalArgumentException("Malformed alignment string: " + alignments);
+      short f = Short.parseShort(pair[0]);
+      short e = Short.parseShort(pair[1]);
+      if (f >= fLength || e >= eLength)
+        throw new IndexOutOfBoundsException("out of bounds: " + f + "," + e);
+      aligned.set(f, e);
+    }
+  }
+
+
+  public String toString() {
+    StringBuffer sb = new StringBuffer();
+    for (short i = 0; i < fLength; i++)
+      for (short j = 0; j < eLength; j++)
+        if (aligned.get(i, j)) sb.append(i).append('-').append(j).append(' ');
+
+    // Remove trailing space
+    if (sb.length() > 0) sb.delete(sb.length() - 1, sb.length());
+
+    return sb.toString();
+  }
+
+
+  /** A (short,short)->boolean map for storing alignments. */
+  private final static class M2 {
+    private short width;
+    private boolean[] bits;
+
+    public M2(short f, short e) {
+      width = f;
+      bits = new boolean[f * e];
+    }
+
+    public boolean get(short f, short e) {
+      return bits[width * e + f];
+    }
+
+    public void set(short f, short e) {
+      try {
+        bits[width * e + f] = true;
+      } catch (ArrayIndexOutOfBoundsException ee) {
+        throw new RuntimeException("Set(" + f + ", " + e + "): caught " + ee);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/BiCorpus.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/BiCorpus.java b/src/main/java/org/apache/joshua/subsample/BiCorpus.java
new file mode 100644
index 0000000..83cba63
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/BiCorpus.java
@@ -0,0 +1,172 @@
+/*
+ * This file is based on the edu.umd.clip.mt.subsample.BiCorpus class from the University of
+ * Maryland's jmtTools project (in conjunction with the umd-hadoop-mt-0.01 project). That project is
+ * released under the terms of the Apache License 2.0, but with special permission for the Joshua
+ * Machine Translation System to release modifications under the LGPL version 2.1. LGPL version 3
+ * requires no special permission since it is compatible with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import joshua.corpus.Phrase;
+
+
+/**
+ * Class for representing a sentence-aligned bi-corpus (with optional word-alignments).
+ * <p>
+ * In order to avoid memory crashes we no longer extend an ArrayList, which tries to cache the
+ * entire file in memory at once. This means we'll re-read through each file (1 +
+ * {@link Subsampler#MAX_SENTENCE_LENGTH} / binsize) times where binsize is determined by the
+ * <code>subsample(String, float, PhraseWriter, BiCorpusFactory)</code> method.
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class BiCorpus implements Iterable<PhrasePair> {
+  // Making these final requires Java6, doesn't work in Java5
+  protected final String foreignFileName;
+  protected final String nativeFileName;
+  protected final String alignmentFileName;
+
+  // ===============================================================
+  // Constructors
+  // ===============================================================
+  /**
+   * Constructor for unaligned BiCorpus.
+   */
+  public BiCorpus(String foreignFileName, String nativeFileName) throws IOException {
+    this(foreignFileName, nativeFileName, null);
+  }
+
+
+  /**
+   * Constructor for word-aligned BiCorpus.
+   */
+  public BiCorpus(String foreignFileName, String nativeFileName, String alignmentFileName)
+      throws IOException, IllegalArgumentException, IndexOutOfBoundsException {
+    this.foreignFileName = foreignFileName;
+    this.nativeFileName = nativeFileName;
+    this.alignmentFileName = alignmentFileName;
+
+    // Check for fileLengthMismatchException
+    // Of course, that will be checked for in each iteration
+    //
+    // We write it this way to avoid warnings from the foreach style loop
+    Iterator<PhrasePair> it = iterator();
+    while (it.hasNext()) {
+      it.next();
+    }
+  }
+
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+  // BUG: We don't close file handles. The other reader classes apparently have finalizers to handle
+  // this well enough for our purposes, but we should migrate to using joshua.util.io.LineReader and
+  // be sure to close it in the end.
+
+  // We're not allowed to throw exceptions from Iterator/Iterable
+  // so we have evil boilerplate to crash the system
+  /**
+   * Iterate through the files represented by this <code>BiCorpus</code>, returning a
+   * {@link PhrasePair} for each pair (or triple) of lines.
+   */
+  @SuppressWarnings("resource")
+  public Iterator<PhrasePair> iterator() {
+    PhraseReader closureRF = null;
+    PhraseReader closureRE = null;
+    BufferedReader closureRA = null;
+    try {
+      closureRF = new PhraseReader(new FileReader(this.foreignFileName), (byte) 1);
+      closureRE = new PhraseReader(new FileReader(this.nativeFileName), (byte) 0);
+      closureRA =
+          (null == this.alignmentFileName ? null : new BufferedReader(new FileReader(
+              this.alignmentFileName)));
+    } catch (FileNotFoundException e) {
+      throw new RuntimeException("File not found", e);
+    }
+    // Making final for closure capturing in the local class definition
+    final PhraseReader rf = closureRF;
+    final PhraseReader re = closureRE;
+    final BufferedReader ra = closureRA;
+
+    return new Iterator<PhrasePair>() { /* Local class definition */
+      private Phrase nextForeignPhrase = null;
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+
+      public boolean hasNext() {
+        if (null == this.nextForeignPhrase) {
+          try {
+            this.nextForeignPhrase = rf.readPhrase();
+          } catch (IOException e) {
+            throw new RuntimeException("IOException", e);
+          }
+        }
+        return null != this.nextForeignPhrase;
+      }
+
+      public PhrasePair next() {
+        if (this.hasNext()) {
+          Phrase f = this.nextForeignPhrase;
+
+          Phrase e = null;
+          try {
+            e = re.readPhrase();
+          } catch (IOException ioe) {
+            throw new RuntimeException("IOException", ioe);
+          }
+          if (null == e) {
+            fileLengthMismatchException();
+            return null; // Needed to make javac happy
+          } else {
+            if (e.size() != 0 && f.size() != 0) {
+              if (null != ra) {
+                String line = null;
+                try {
+                  line = ra.readLine();
+                } catch (IOException ioe) {
+                  throw new RuntimeException("IOException", ioe);
+                }
+
+                if (null == line) {
+                  fileLengthMismatchException();
+                  return null; // Needed to make javac happy
+                } else {
+                  Alignment a = new Alignment((short) f.size(), (short) e.size(), line);
+
+                  this.nextForeignPhrase = null;
+                  return new PhrasePair(f, e, a);
+                }
+              } else {
+                this.nextForeignPhrase = null;
+                return new PhrasePair(f, e);
+              }
+            } else {
+              // Inverted while loop
+              this.nextForeignPhrase = null;
+              return this.next();
+            }
+          }
+        } else {
+          throw new NoSuchElementException();
+        }
+      }
+    }; /* End local class definition */
+  } /* end iterator() */
+
+
+  private static void fileLengthMismatchException() throws RuntimeException {
+    throw new RuntimeException("Mismatched file lengths!");
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java b/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
new file mode 100644
index 0000000..eea8937
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.subsample;
+
+import java.io.File;
+import java.io.IOException;
+
+
+/**
+ * A callback closure for <code>Subsampler.subsample</code>. This class is used by
+ * {@link AlignedSubsampler} in order to "override" methods of {@link Subsampler}, minimizing code
+ * duplication.
+ * 
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class BiCorpusFactory {
+  // Making these final requires Java6, doesn't work in Java5
+  protected final String fpath;
+  protected final String epath;
+  protected final String apath;
+  protected final String extf;
+  protected final String exte;
+  protected final String exta;
+
+  public BiCorpusFactory(String fpath, String epath, String apath, String extf, String exte,
+      String exta) {
+    // The various concatenation has been moved up here
+    // to get it out of the loops where fromFiles is called.
+    this.fpath = (fpath == null ? "." : fpath) + File.separator;
+    this.epath = (epath == null ? "." : epath) + File.separator;
+    this.apath = (apath == null ? "." : apath) + File.separator;
+    this.extf = "." + extf;
+    this.exte = "." + exte;
+    this.exta = (exta == null ? null : "." + exta);
+  }
+
+
+  /** Generate unaligned BiCorpus by default. */
+  public BiCorpus fromFiles(String f) throws IOException {
+    return this.unalignedFromFiles(f);
+  }
+
+  /** Generate unaligned BiCorpus. */
+  public BiCorpus unalignedFromFiles(String f) throws IOException {
+    return new BiCorpus(fpath + f + extf, epath + f + exte);
+  }
+
+  /** Generate aligned BiCorpus. */
+  public BiCorpus alignedFromFiles(String f) throws IOException {
+    return new BiCorpus(fpath + f + extf, epath + f + exte, apath + f + exta);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/PhrasePair.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/PhrasePair.java b/src/main/java/org/apache/joshua/subsample/PhrasePair.java
new file mode 100644
index 0000000..36a1da5
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/PhrasePair.java
@@ -0,0 +1,64 @@
+/*
+ * This file is based on the edu.umd.clip.mt.PhrasePair class from the University of Maryland's
+ * umd-hadoop-mt-0.01 project. That project is released under the terms of the Apache License 2.0,
+ * but with special permission for the Joshua Machine Translation System to release modifications
+ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible
+ * with Apache License 2.0
+ */
+package joshua.subsample;
+
+// TODO: if we generalize the Alignment class, we could move this
+// to joshua.util.sentence.
+
+import joshua.corpus.Phrase;
+
+
+/**
+ * Phrase-aligned tuple class associating an F phrase, E phrase, and (possibly null)
+ * word-alignments. This is primarily for maintaining sentence-alignment.
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class PhrasePair {
+  // Making these final requires Java6, not Java5
+  private final Phrase f;
+  private final Phrase e;
+  private final Alignment a;
+
+  // ===============================================================
+  // Constructors
+  // ===============================================================
+  public PhrasePair(Phrase f_, Phrase e_) {
+    this(f_, e_, null);
+  }
+
+  public PhrasePair(Phrase f, Phrase e, Alignment a) {
+    this.f = f;
+    this.e = e;
+    this.a = a;
+  }
+
+  // ===============================================================
+  // Attributes
+  // ===============================================================
+  public Phrase getF() {
+    return f;
+  }
+
+  public Phrase getE() {
+    return e;
+  }
+
+  public Alignment getAlignment() {
+    return a;
+  }
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+  public float ratioFtoE() {
+    return ((float) this.f.size()) / ((float) this.e.size());
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/PhraseReader.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/PhraseReader.java b/src/main/java/org/apache/joshua/subsample/PhraseReader.java
new file mode 100644
index 0000000..f6dd6d3
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/PhraseReader.java
@@ -0,0 +1,36 @@
+/*
+ * This file is based on the edu.umd.clip.mt.PhraseReader class from the University of Maryland's
+ * umd-hadoop-mt-0.01 project. That project is released under the terms of the Apache License 2.0,
+ * but with special permission for the Joshua Machine Translation System to release modifications
+ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible
+ * with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import joshua.corpus.BasicPhrase;
+
+
+/**
+ * Wrapper class to read in each line as a BasicPhrase.
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class PhraseReader extends BufferedReader {
+  private byte language;
+
+  public PhraseReader(Reader r, byte language) {
+    super(r);
+    this.language = language;
+  }
+
+  public BasicPhrase readPhrase() throws IOException {
+    String line = super.readLine();
+    return (line == null ? null : new BasicPhrase(this.language, line));
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/PhraseWriter.java b/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
new file mode 100644
index 0000000..16a3563
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+
+
+/**
+ * A PhrasePair-parallel BufferedWriter. In an ideal world we could get the compiler to inline all
+ * of this, to have zero-overhead while not duplicating code. Alas, Java's not that cool. The
+ * "final" could help on JIT at least.
+ * 
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+final public class PhraseWriter {
+  // Making these final requires Java6, not Java5
+  private final BufferedWriter wf;
+  private final BufferedWriter we;
+  private final BufferedWriter wa;
+
+  // ===============================================================
+  // Constructors
+  // ===============================================================
+  public PhraseWriter(BufferedWriter wf_, BufferedWriter we_) {
+    this(wf_, we_, null);
+  }
+
+  public PhraseWriter(BufferedWriter wf, BufferedWriter we, BufferedWriter wa) {
+    this.wf = wf;
+    this.we = we;
+    this.wa = wa;
+  }
+
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+  public void write(PhrasePair pp) throws IOException {
+    this.wf.write(pp.getF().toString());
+    this.we.write(pp.getE().toString());
+    if (null != this.wa) this.wa.write(pp.getAlignment().toString());
+  }
+
+  public void newLine() throws IOException {
+    this.wf.newLine();
+    this.we.newLine();
+    if (null != this.wa) this.wa.newLine();
+  }
+
+  public void flush() throws IOException {
+    this.wf.flush();
+    this.we.flush();
+    if (null != this.wa) this.wa.flush();
+  }
+
+  public void close() throws IOException {
+    this.wf.close();
+    this.we.close();
+    if (null != this.wa) this.wa.close();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/Subsampler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/Subsampler.java b/src/main/java/org/apache/joshua/subsample/Subsampler.java
new file mode 100644
index 0000000..49e1a16
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/Subsampler.java
@@ -0,0 +1,228 @@
+/*
+ * This file is based on the edu.umd.clip.mt.subsample.Subsampler class from the University of
+ * Maryland's jmtTools project (in conjunction with the umd-hadoop-mt-0.01 project). That project is
+ * released under the terms of the Apache License 2.0, but with special permission for the Joshua
+ * Machine Translation System to release modifications under the LGPL version 2.1. LGPL version 3
+ * requires no special permission since it is compatible with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import joshua.corpus.BasicPhrase;
+import joshua.corpus.Phrase;
+
+
+/**
+ * A class for subsampling a large (F,E)-parallel sentence-aligned corpus to generate a smaller
+ * corpus whose N-grams are relevant to some seed corpus. The idea of subsampling owes to Kishore
+ * Papineni.
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class Subsampler {
+  protected Map<Phrase, Integer> ngramCounts;
+  protected int maxN;
+  protected int targetCount;
+  protected int maxSubsample = 1500000;
+
+  protected static final int MAX_SENTENCE_LENGTH = 100;
+  protected static final int MIN_RATIO_LENGTH = 10;
+
+
+  public Subsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+    this.maxN = maxN;
+    this.targetCount = targetCount;
+    this.ngramCounts = loadNgrams(testFiles);
+  }
+
+  private HashMap<Phrase, Integer> loadNgrams(String[] files) throws IOException {
+    HashMap<Phrase, Integer> map = new HashMap<Phrase, Integer>();
+    for (String fn : files) {
+      System.err.println("Loading test set from " + fn + "...");
+
+      PhraseReader reader = new PhraseReader(new FileReader(fn), (byte) 1);
+      Phrase phrase;
+      int lineCount = 0;
+      try {
+        while ((phrase = reader.readPhrase()) != null) {
+          lineCount++;
+          List<Phrase> ngrams = phrase.getSubPhrases(this.maxN);
+          for (Phrase ngram : ngrams)
+            map.put(ngram, 0);
+        }
+      } finally {
+        reader.close();
+      }
+      System.err.println("Processed " + lineCount + " lines in " + fn);
+    }
+    System.err.println("Test set: " + map.size() + " ngrams");
+    return map;
+  }
+
+
+  /**
+   * The general subsampler function for external use.
+   * 
+   * @param filelist list of source files to subsample from
+   * @param targetFtoERatio goal for ratio of output F length to output E length
+   * @param extf extension of F files
+   * @param exte extension of E files
+   * @param fpath path to source F files
+   * @param epath path to source E files
+   * @param output basename for output files (will append extensions)
+   */
+  public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+      String fpath, String epath, String output) throws IOException {
+    this.subsample(filelist, targetFtoERatio, new PhraseWriter(new BufferedWriter(
+        new OutputStreamWriter(new FileOutputStream(output + "." + extf), "UTF8")),
+        new BufferedWriter(
+            new OutputStreamWriter(new FileOutputStream(output + "." + exte), "UTF8"))),
+        new BiCorpusFactory(fpath, epath, null, extf, exte, null));
+  }
+
+  /**
+   * The main wrapper for the subsample worker. Closes the PhraseWriter before exiting.
+   */
+  protected void subsample(String filelist, float targetFtoERatio, PhraseWriter out,
+      BiCorpusFactory bcFactory) throws IOException {
+    try {
+      // Read filenames into a list
+      List<String> files = new ArrayList<String>();
+      {
+        FileReader fr = null;
+        BufferedReader br = null;
+        try {
+          fr = new FileReader(filelist);
+          br = new BufferedReader(fr);
+          String file;
+          while ((file = br.readLine()) != null) {
+            files.add(file);
+          }
+        } finally {
+          // Maybe redundant, but UMD's FixBugs says to
+          // close br (and close is idempotent anyways)
+          if (null != fr) fr.close();
+          if (null != br) br.close();
+        }
+      }
+
+      int totalSubsampled = 0;
+      // Iterating on files in order biases towards files
+      // earlier in the list
+      for (String f : files) {
+        System.err.println("Loading training data: " + f);
+
+        BiCorpus bc = bcFactory.fromFiles(f);
+
+        HashMap<PhrasePair, PhrasePair> set = new HashMap<PhrasePair, PhrasePair>();
+
+        int binsize = 10; // BUG: Magic-Number
+        int max_k = MAX_SENTENCE_LENGTH / binsize;
+        System.err.print("Looking in length range");
+        // Iterating bins from small to large biases
+        // towards short sentences
+        for (int k = 0; k < max_k; k++) {
+          System.err.print(" [" + (k * binsize + 1) + "," + ((k + 1) * binsize) + "]");
+          System.err.flush();
+
+          this.subsample(set, bc, k * binsize + 1, (k + 1) * binsize, targetFtoERatio);
+
+          if (set.size() + totalSubsampled > maxSubsample) break;
+        }
+
+        float ff = 0.0f;
+        float ef = 0.0f;
+        for (PhrasePair pp : set.keySet()) {
+          // Get pp.ratioFtoE() for all pp
+          ff += pp.getF().size();
+          ef += pp.getE().size();
+
+          out.write(set.get(pp));
+          out.newLine();
+        }
+        out.flush();
+
+        totalSubsampled += set.size();
+        System.err.println("\n  current=" + set.size() + " [total=" + totalSubsampled
+            + "]    currentRatio=" + (ff / ef));
+        System.err.flush();
+
+        // TODO: is this gc actually dubious? Or
+        // does profiling show it helps? We only
+        // do it once per file, so it's not a
+        // performance blackhole.
+        set = null;
+        bc = null;
+        System.gc();
+      }
+    } finally {
+      out.close();
+    }
+  }
+
+  /**
+   * The worker function for subsampling.
+   * 
+   * @param set The set to put selected sentences into
+   * @param bc The sentence-aligned corpus to read from
+   * @param minLength The minimum F sentence length
+   * @param maxLength The maximum F sentence length
+   * @param targetFtoERatio The desired ratio of F length to E length
+   */
+  private void subsample(HashMap<PhrasePair, PhrasePair> set, BiCorpus bc, int minLength,
+      int maxLength, float targetFtoERatio) {
+    for (PhrasePair pp : bc) {
+      PhrasePair lowercase_pp =
+          new PhrasePair(new BasicPhrase((byte) 1, pp.getF().toString().toLowerCase()),
+              new BasicPhrase((byte) 1, pp.getE().toString().toLowerCase()), pp.getAlignment());
+
+      {
+        int eLength = pp.getE().size();
+        if (eLength == 0 || eLength > MAX_SENTENCE_LENGTH) continue;
+      }
+
+      int fLength = pp.getF().size();
+      if (fLength == 0 || fLength < minLength || fLength > maxLength
+          || fLength > MAX_SENTENCE_LENGTH) continue;
+      if (fLength > 10 && targetFtoERatio != 0.0f) {
+        float ratio = pp.ratioFtoE();
+        if (fLength >= MIN_RATIO_LENGTH
+            && (ratio > 1.3f * targetFtoERatio || ratio * 1.3f < targetFtoERatio)) continue;
+      }
+      if (set.containsKey(lowercase_pp)) continue;
+
+      // at this point, length checks out and the sentence hasn't
+      // been selected yet
+
+      List<Phrase> ngrams = pp.getF().getSubPhrases(this.maxN);
+      boolean useSentence = false;
+      for (Phrase ng : ngrams) {
+        Integer count = this.ngramCounts.get(ng);
+        if (count == null) continue;
+        if (count < targetCount) {
+          useSentence = true;
+          count++;
+          this.ngramCounts.put(ng, count);
+        }
+      }
+      if (useSentence) set.put(lowercase_pp, pp);
+    }
+  }
+
+
+  public static void main(String[] args) {
+    new SubsamplerCLI().runMain(args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java b/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
new file mode 100644
index 0000000..ad80b74
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
@@ -0,0 +1,121 @@
+/*
+ * This file uses code from the edu.umd.clip.mt.subsample.Subsampler class from the University of
+ * Maryland's jmtTools project (in conjunction with the umd-hadoop-mt-0.01 project). That project is
+ * released under the terms of the Apache License 2.0, but with special permission for the Joshua
+ * Machine Translation System to release modifications under the LGPL version 2.1. LGPL version 3
+ * requires no special permission since it is compatible with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.IOException;
+
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+
+/**
+ * This class defines a callback closure to allow "overriding" the main function in subclasses of
+ * {@link Subsampler}, without duplicating code. For all subclasses, CLI <code>Options</code> should
+ * be members of the class (so they're visible to <code>runSubsampler</code> as well as
+ * <code>getCliOptions</code>), the <code>getCliOptions</code> method should be overridden to add
+ * the additional options (via <code>super</code> to keep the old options), and the
+ * <code>runSubsampler</code> method should be overridden to do the primary work for main. The
+ * <code>runMain</code> method ties everything together and should not need modification. Due to the
+ * one-use nature of subclasses of <code>SubsampleCLI</code>, they generally should be implemented
+ * as anonymous local classes.
+ * 
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+@SuppressWarnings("static-access")
+public class SubsamplerCLI {
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option ot = OptionBuilder.withArgName("listfile").hasArg()
+      .withDescription("A file containing a list of training file basenames (what to sample from)")
+      .isRequired().create("training");
+
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option otest = OptionBuilder.withArgName("file").hasArgs()
+      .withDescription("The test file (what to sample for)").isRequired().create("test");
+
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option ooutput = OptionBuilder.withArgName("basename").hasArgs()
+      .withDescription("File basename for output training corpus").isRequired().create("output");
+
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option of = OptionBuilder.withArgName("lang").hasArg()
+      .withDescription("Foreign language extension").isRequired().create("f");
+
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option oe = OptionBuilder.withArgName("lang").hasArg()
+      .withDescription("Native language extension").isRequired().create("e");
+
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option ofpath = OptionBuilder.withArgName("path").hasArg()
+      .withDescription("Directory containing foreign language files").create("fpath");
+
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option oepath = OptionBuilder.withArgName("path").hasArg()
+      .withDescription("Directory containing native language files").create("epath");
+
+  // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+  protected final Option oratio = OptionBuilder.withArgName("ratio").hasArg()
+      .withDescription("Target F/E ratio").create("ratio");
+
+  /**
+   * Return all Options. The HelpFormatter will print them in sorted order, so it doesn't matter
+   * when we add them. Subclasses should override this method by adding more options.
+   */
+  public Options getCliOptions() {
+    return new Options().addOption(ot).addOption(otest).addOption(of).addOption(oe)
+        .addOption(ofpath).addOption(oepath).addOption(oratio).addOption(ooutput);
+  }
+
+  /**
+   * This method should be overridden to return the class used in runSubsampler.
+   */
+  public String getClassName() {
+    return Subsampler.class.getName();
+  }
+
+  /**
+   * Callback to run the subsampler. This function needs access to the variables holding each
+   * Option, thus all this closure nonsense.
+   */
+  public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+      throws IOException {
+    new Subsampler(testFiles, maxN, targetCount).subsample(ot.getValue(), ratio, of.getValue(),
+        oe.getValue(), ofpath.getValue(), oepath.getValue(), ooutput.getValue());
+  }
+
+  /**
+   * Non-static version of main so that we can define anonymous local classes to override or extend
+   * the above.
+   */
+  public void runMain(String[] args) {
+    Options o = this.getCliOptions();
+    try {
+      new GnuParser().parse(o, args);
+    } catch (ParseException pe) {
+      // The message from pe is ugly, so we omit it.
+      System.err.println("Error parsing command line");
+      new HelpFormatter().printHelp(this.getClassName(), o);
+      System.exit(1);
+    }
+
+    try {
+      float ratio = 0.8f;
+      if (this.oratio.getValue() != null) {
+        ratio = Float.parseFloat(this.oratio.getValue());
+      }
+      this.runSubsampler(this.otest.getValues(), 12, 20, ratio);
+    } catch (Exception e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/package.html
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/package.html b/src/main/java/org/apache/joshua/subsample/package.html
new file mode 100644
index 0000000..bed439c
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/package.html
@@ -0,0 +1,25 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head></head>
+<body bgcolor="white">
+
+<!--
+##### THIS IS THE TEMPLATE FOR THE PACKAGE DOC COMMENTS. #####
+##### TYPE YOUR PACKAGE COMMENTS HERE.  BEGIN WITH A     #####
+##### ONE-SENTENCE SUMMARY STARTING WITH A VERB LIKE:    #####
+-->
+
+Provides executables Subsampler and AlignedSubsampler, for subsampling from large training corpora based on a test corpus.
+
+<!--
+<h2>Related Documentation</h2>
+
+<ul>
+  <li>Much of the code in this package is based on .....
+</ul>
+-->
+
+<!-- Put @see and @since tags down here. -->
+
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/tools/GrammarPacker.java b/src/main/java/org/apache/joshua/tools/GrammarPacker.java
new file mode 100644
index 0000000..33d3391
--- /dev/null
+++ b/src/main/java/org/apache/joshua/tools/GrammarPacker.java
@@ -0,0 +1,983 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.tools;
+
+import static joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import joshua.corpus.Vocabulary;
+import joshua.util.FormatUtils;
+import joshua.util.encoding.EncoderConfiguration;
+import joshua.util.encoding.FeatureTypeAnalyzer;
+import joshua.util.encoding.IntEncoder;
+import joshua.util.io.LineReader;
+
+public class GrammarPacker {
+
+  private static final Logger logger = Logger.getLogger(GrammarPacker.class.getName());
+
+  // Size limit for slice in bytes.
+  private static int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
+  // Estimated average number of feature entries for one rule.
+  private static int DATA_SIZE_ESTIMATE = 20;
+
+  private static final String SOURCE_WORDS_SEPARATOR = " ||| ";
+
+  // Output directory name.
+  private String output;
+
+  // Input grammar to be packed.
+  private String grammar;
+
+  public String getGrammar() {
+    return grammar;
+  }
+  
+  public String getOutputDirectory() {
+    return output;
+  }
+
+  // Approximate maximum size of a slice in number of rules
+  private int approximateMaximumSliceSize;
+
+  private boolean labeled;
+
+  private boolean packAlignments;
+  private boolean grammarAlignments;
+  private String alignments;
+
+  private FeatureTypeAnalyzer types;
+  private EncoderConfiguration encoderConfig;
+
+  private String dump;
+
+  private int max_source_len;
+
+  public GrammarPacker(String grammar_filename, String config_filename, String output_filename,
+      String alignments_filename, String featuredump_filename, boolean grammar_alignments,
+      int approximateMaximumSliceSize)
+      throws IOException {
+    this.labeled = true;
+    this.grammar = grammar_filename;
+    this.output = output_filename;
+    this.dump = featuredump_filename;
+    this.grammarAlignments = grammar_alignments;
+    this.approximateMaximumSliceSize = approximateMaximumSliceSize;
+    this.max_source_len = 0;
+
+    // TODO: Always open encoder config? This is debatable.
+    this.types = new FeatureTypeAnalyzer(true);
+
+    this.alignments = alignments_filename;
+    packAlignments = grammarAlignments || (alignments != null);
+    if (!packAlignments) {
+      logger.info("No alignments file or grammar specified, skipping.");
+    } else if (alignments != null && !new File(alignments_filename).exists()) {
+      logger.severe("Alignments file does not exist: " + alignments);
+      System.exit(1);
+    }
+
+    if (config_filename != null) {
+      readConfig(config_filename);
+      types.readConfig(config_filename);
+    } else {
+      logger.info("No config specified. Attempting auto-detection of feature types.");
+    }
+    logger.info(String.format("Approximate maximum slice size (in # of rules) set to %s", approximateMaximumSliceSize));
+
+    File working_dir = new File(output);
+    working_dir.mkdir();
+    if (!working_dir.exists()) {
+      logger.severe("Failed creating output directory.");
+      System.exit(1);
+    }
+  }
+
+  private void readConfig(String config_filename) throws IOException {
+    LineReader reader = new LineReader(config_filename);
+    while (reader.hasNext()) {
+      // Clean up line, chop comments off and skip if the result is empty.
+      String line = reader.next().trim();
+      if (line.indexOf('#') != -1)
+        line = line.substring(0, line.indexOf('#'));
+      if (line.isEmpty())
+        continue;
+      String[] fields = line.split("[\\s]+");
+
+      if (fields.length < 2) {
+        logger.severe("Incomplete line in config.");
+        System.exit(1);
+      }
+      if ("slice_size".equals(fields[0])) {
+        // Number of records to concurrently load into memory for sorting.
+        approximateMaximumSliceSize = Integer.parseInt(fields[1]);
+      }
+    }
+    reader.close();
+  }
+
+  /**
+   * Executes the packing.
+   * 
+   * @throws IOException
+   */
+  public void pack() throws IOException {
+    logger.info("Beginning exploration pass.");
+    LineReader grammar_reader = null;
+    LineReader alignment_reader = null;
+
+    // Explore pass. Learn vocabulary and feature value histograms.
+    logger.info("Exploring: " + grammar);
+    grammar_reader = new LineReader(grammar);
+    explore(grammar_reader);
+
+    logger.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
+    if (dump != null) {
+      PrintWriter dump_writer = new PrintWriter(dump);
+      dump_writer.println(types.toString());
+      dump_writer.close();
+    }
+
+    types.inferTypes(this.labeled);
+    logger.info("Type inference complete.");
+
+    logger.info("Finalizing encoding.");
+
+    logger.info("Writing encoding.");
+    types.write(output + File.separator + "encoding");
+
+    writeVocabulary();
+
+    String configFile = output + File.separator + "config";
+    logger.info(String.format("Writing config to '%s'", configFile));
+    // Write config options
+    FileWriter config = new FileWriter(configFile);
+    config.write(String.format("max-source-len = %d\n", max_source_len));
+    config.close();
+    
+    // Read previously written encoder configuration to match up to changed
+    // vocabulary id's.
+    logger.info("Reading encoding.");
+    encoderConfig = new EncoderConfiguration();
+    encoderConfig.load(output + File.separator + "encoding");
+
+    logger.info("Beginning packing pass.");
+    // Actual binarization pass. Slice and pack source, target and data.
+    grammar_reader = new LineReader(grammar);
+
+    if (packAlignments && !grammarAlignments)
+      alignment_reader = new LineReader(alignments);
+    binarize(grammar_reader, alignment_reader);
+    logger.info("Packing complete.");
+
+    logger.info("Packed grammar in: " + output);
+    logger.info("Done.");
+  }
+
+  private void explore(LineReader grammar) {
+    int counter = 0;
+    // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
+    // appear in the same order. They are assigned numeric names in order of appearance.
+    this.types.setLabeled(true);
+
+    while (grammar.hasNext()) {
+      String line = grammar.next().trim();
+      counter++;
+      ArrayList<String> fields = new ArrayList<String>(Arrays.asList(line.split("\\s\\|{3}\\s")));
+
+      String lhs = null;
+      if (line.startsWith("[")) {
+        // hierarchical model
+        if (fields.size() < 4) {
+          logger.warning(String.format("Incomplete grammar line at line %d: '%s'", counter, line));
+          continue;
+        }
+        lhs = fields.remove(0);
+      } else {
+        // phrase-based model
+        if (fields.size() < 3) {
+          logger.warning("Incomplete phrase line at line " + counter);
+          logger.warning(line);
+          continue;
+        }
+        lhs = "[X]";
+      }
+
+      String[] source = fields.get(0).split("\\s");
+      String[] target = fields.get(1).split("\\s");
+      String[] features = fields.get(2).split("\\s");
+      
+      max_source_len = Math.max(max_source_len, source.length);
+
+      Vocabulary.id(lhs);
+      try {
+        /* Add symbols to vocabulary.
+         * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
+         * and "[X,1]" to the vocabulary.
+         */
+        for (String source_word : source) {
+          Vocabulary.id(source_word);
+          if (FormatUtils.isNonterminal(source_word)) {
+            Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_word));
+          }
+        }
+        for (String target_word : target) {
+          Vocabulary.id(target_word);
+          if (FormatUtils.isNonterminal(target_word)) {
+            Vocabulary.id(FormatUtils.stripNonTerminalIndex(target_word));
+          }
+        }
+      } catch (java.lang.StringIndexOutOfBoundsException e) {
+        System.err.println(String.format("* Skipping bad grammar line '%s'", line));
+        continue;
+      }
+
+      // Add feature names to vocabulary and pass the value through the
+      // appropriate encoder.
+      int feature_counter = 0;
+      for (int f = 0; f < features.length; ++f) {
+        if (features[f].contains("=")) {
+          String[] fe = features[f].split("=");
+          if (fe[0].equals("Alignment"))
+            continue;
+          types.observe(Vocabulary.id(fe[0]), Float.parseFloat(fe[1]));
+        } else {
+          types.observe(Vocabulary.id(String.valueOf(feature_counter++)),
+              Float.parseFloat(features[f]));
+        }
+      }
+    }
+  }
+
+  /**
+   * Returns a String encoding the first two source words.
+   * If there is only one source word, use empty string for the second.
+   */
+  private String getFirstTwoSourceWords(final String[] source_words) {
+    return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
+  }
+
+  private void binarize(LineReader grammar_reader, LineReader alignment_reader) throws IOException {
+    int counter = 0;
+    int slice_counter = 0;
+    int num_slices = 0;
+
+    boolean ready_to_flush = false;
+    // to determine when flushing is possible
+    String prev_first_two_source_words = null;
+
+    PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
+    PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
+    FeatureBuffer feature_buffer = new FeatureBuffer();
+
+    AlignmentBuffer alignment_buffer = null;
+    if (packAlignments)
+      alignment_buffer = new AlignmentBuffer();
+
+    TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
+    while (grammar_reader.hasNext()) {
+      String grammar_line = grammar_reader.next().trim();
+      counter++;
+      slice_counter++;
+
+      ArrayList<String> fields = new ArrayList<String>(Arrays.asList(grammar_line.split("\\s\\|{3}\\s")));
+      String lhs_word;
+      String[] source_words;
+      String[] target_words;
+      String[] feature_entries;
+      if (grammar_line.startsWith("[")) {
+        if (fields.size() < 4)
+          continue;
+
+        lhs_word = fields.remove(0);
+        source_words = fields.get(0).split("\\s");
+        target_words = fields.get(1).split("\\s");
+        feature_entries = fields.get(2).split("\\s");
+
+      } else {
+        if (fields.size() < 3)
+          continue;
+        
+        lhs_word = "[X]";
+        String tmp = "[X,1] " + fields.get(0);
+        source_words = tmp.split("\\s");
+        tmp = "[X,1] " + fields.get(1);
+        target_words = tmp.split("\\s");
+        feature_entries = fields.get(2).split("\\s");
+      }
+
+      // Reached slice limit size, indicate that we're closing up.
+      if (!ready_to_flush
+          && (slice_counter > approximateMaximumSliceSize
+              || feature_buffer.overflowing()
+              || (packAlignments && alignment_buffer.overflowing()))) {
+        ready_to_flush = true;
+        // store the first two source words when slice size limit was reached
+        prev_first_two_source_words = getFirstTwoSourceWords(source_words);
+      }
+      // ready to flush
+      if (ready_to_flush) {
+        final String first_two_source_words = getFirstTwoSourceWords(source_words);
+        // the grammar can only be partitioned at the level of first two source word changes.
+        // Thus, we can only flush if the current first two source words differ from the ones
+        // when the slice size limit was reached.
+        if (!first_two_source_words.equals(prev_first_two_source_words)) {
+          logger.warning(String.format("ready to flush and first two words have changed (%s vs. %s)", prev_first_two_source_words, first_two_source_words));
+          logger.info(String.format("flushing %d rules to slice.", slice_counter));
+          flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+          source_trie.clear();
+          target_trie.clear();
+          feature_buffer.clear();
+          if (packAlignments)
+            alignment_buffer.clear();
+
+          num_slices++;
+          slice_counter = 0;
+          ready_to_flush = false;
+        }
+      }
+
+      int alignment_index = -1;
+      // If present, process alignments.
+      if (packAlignments) {
+        String alignment_line;
+        if (grammarAlignments) {
+          alignment_line = fields.get(3);
+        } else {
+          if (!alignment_reader.hasNext()) {
+            logger.severe("No more alignments starting in line " + counter);
+            throw new RuntimeException("No more alignments starting in line " + counter);
+          }
+          alignment_line = alignment_reader.next().trim();
+        }
+        String[] alignment_entries = alignment_line.split("\\s");
+        byte[] alignments = new byte[alignment_entries.length * 2];
+        if (alignment_entries.length != 0) {
+          for (int i = 0; i < alignment_entries.length; i++) {
+            String[] parts = alignment_entries[i].split("-");
+            alignments[2 * i] = Byte.parseByte(parts[0]);
+            alignments[2 * i + 1] = Byte.parseByte(parts[1]);
+          }
+        }
+        alignment_index = alignment_buffer.add(alignments);
+      }
+
+      // Process features.
+      // Implicitly sort via TreeMap, write to data buffer, remember position
+      // to pass on to the source trie node.
+      features.clear();
+      int feature_count = 0;
+      for (int f = 0; f < feature_entries.length; ++f) {
+        String feature_entry = feature_entries[f];
+        int feature_id;
+        float feature_value; 
+        if (feature_entry.contains("=")) {
+          String[] parts = feature_entry.split("=");
+          if (parts[0].equals("Alignment"))
+            continue;
+          feature_id = Vocabulary.id(parts[0]);
+          feature_value = Float.parseFloat(parts[1]);
+        } else {
+          feature_id = Vocabulary.id(String.valueOf(feature_count++));
+          feature_value = Float.parseFloat(feature_entry);
+        }
+        if (feature_value != 0)
+          features.put(encoderConfig.innerId(feature_id), feature_value);
+      }
+      int features_index = feature_buffer.add(features);
+
+      // Sanity check on the data block index.
+      if (packAlignments && features_index != alignment_index) {
+        logger.severe("Block index mismatch between features (" + features_index
+            + ") and alignments (" + alignment_index + ").");
+        throw new RuntimeException("Data block index mismatch.");
+      }
+
+      // Process source side.
+      SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
+      int[] source = new int[source_words.length];
+      for (int i = 0; i < source_words.length; i++) {
+        if (FormatUtils.isNonterminal(source_words[i]))
+          source[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_words[i]));
+        else
+          source[i] = Vocabulary.id(source_words[i]);
+      }
+      source_trie.add(source, sv);
+
+      // Process target side.
+      TargetValue tv = new TargetValue(sv);
+      int[] target = new int[target_words.length];
+      for (int i = 0; i < target_words.length; i++) {
+        if (FormatUtils.isNonterminal(target_words[i])) {
+          target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
+        } else {
+          target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
+        }
+      }
+      target_trie.add(target, tv);
+    }
+    // flush last slice and clear buffers
+    flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+  }
+
+  /**
+   * Serializes the source, target and feature data structures into interlinked binary files. Target
+   * is written first, into a skeletal (node don't carry any data) upward-pointing trie, updating
+   * the linking source trie nodes with the position once it is known. Source and feature data are
+   * written simultaneously. The source structure is written into a downward-pointing trie and
+   * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
+   * prompted to write out a block
+   * 
+   * @param source_trie
+   * @param target_trie
+   * @param feature_buffer
+   * @param id
+   * @throws IOException
+   */
+  private void flush(PackingTrie<SourceValue> source_trie,
+      PackingTrie<TargetValue> target_trie, FeatureBuffer feature_buffer,
+      AlignmentBuffer alignment_buffer, int id) throws IOException {
+    // Make a slice object for this piece of the grammar.
+    PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
+    // Pull out the streams for source, target and data output.
+    DataOutputStream source_stream = slice.getSourceOutput();
+    DataOutputStream target_stream = slice.getTargetOutput();
+    DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
+    DataOutputStream feature_stream = slice.getFeatureOutput();
+    DataOutputStream alignment_stream = slice.getAlignmentOutput();
+
+    Queue<PackingTrie<TargetValue>> target_queue;
+    Queue<PackingTrie<SourceValue>> source_queue;
+
+    // The number of bytes both written into the source stream and
+    // buffered in the source queue.
+    int source_position;
+    // The number of bytes written into the target stream.
+    int target_position;
+
+    // Add trie root into queue, set target position to 0 and set cumulated
+    // size to size of trie root.
+    target_queue = new LinkedList<PackingTrie<TargetValue>>();
+    target_queue.add(target_trie);
+    target_position = 0;
+
+    // Target lookup table for trie levels.
+    int current_level_size = 1;
+    int next_level_size = 0;
+    ArrayList<Integer> target_lookup = new ArrayList<Integer>();
+
+    // Packing loop for upwards-pointing target trie.
+    while (!target_queue.isEmpty()) {
+      // Pop top of queue.
+      PackingTrie<TargetValue> node = target_queue.poll();
+      // Register that this is where we're writing the node to.
+      node.address = target_position;
+      // Tell source nodes that we're writing to this position in the file.
+      for (TargetValue tv : node.values)
+        tv.parent.target = node.address;
+      // Write link to parent.
+      if (node.parent != null)
+        target_stream.writeInt(node.parent.address);
+      else
+        target_stream.writeInt(-1);
+      target_stream.writeInt(node.symbol);
+      // Enqueue children.
+      for (int k : node.children.descendingKeySet()) {
+        PackingTrie<TargetValue> child = node.children.get(k);
+        target_queue.add(child);
+      }
+      target_position += node.size(false, true);
+      next_level_size += node.children.descendingKeySet().size();
+
+      current_level_size--;
+      if (current_level_size == 0) {
+        target_lookup.add(target_position);
+        current_level_size = next_level_size;
+        next_level_size = 0;
+      }
+    }
+    target_lookup_stream.writeInt(target_lookup.size());
+    for (int i : target_lookup)
+      target_lookup_stream.writeInt(i);
+    target_lookup_stream.close();
+
+    // Setting up for source and data writing.
+    source_queue = new LinkedList<PackingTrie<SourceValue>>();
+    source_queue.add(source_trie);
+    source_position = source_trie.size(true, false);
+    source_trie.address = target_position;
+
+    // Ready data buffers for writing.
+    feature_buffer.initialize();
+    if (packAlignments)
+      alignment_buffer.initialize();
+
+    // Packing loop for downwards-pointing source trie.
+    while (!source_queue.isEmpty()) {
+      // Pop top of queue.
+      PackingTrie<SourceValue> node = source_queue.poll();
+      // Write number of children.
+      source_stream.writeInt(node.children.size());
+      // Write links to children.
+      for (int k : node.children.descendingKeySet()) {
+        PackingTrie<SourceValue> child = node.children.get(k);
+        // Enqueue child.
+        source_queue.add(child);
+        // Child's address will be at the current end of the queue.
+        child.address = source_position;
+        // Advance cumulated size by child's size.
+        source_position += child.size(true, false);
+        // Write the link.
+        source_stream.writeInt(k);
+        source_stream.writeInt(child.address);
+      }
+      // Write number of data items.
+      source_stream.writeInt(node.values.size());
+      // Write lhs and links to target and data.
+      for (SourceValue sv : node.values) {
+        int feature_block_index = feature_buffer.write(sv.data);
+        if (packAlignments) {
+          int alignment_block_index = alignment_buffer.write(sv.data);
+          if (alignment_block_index != feature_block_index) {
+            logger.severe("Block index mismatch.");
+            throw new RuntimeException("Block index mismatch: alignment (" + alignment_block_index
+                + ") and features (" + feature_block_index + ") don't match.");
+          }
+        }
+        source_stream.writeInt(sv.lhs);
+        source_stream.writeInt(sv.target);
+        source_stream.writeInt(feature_block_index);
+      }
+    }
+    // Flush the data stream.
+    feature_buffer.flush(feature_stream);
+    if (packAlignments)
+      alignment_buffer.flush(alignment_stream);
+
+    target_stream.close();
+    source_stream.close();
+    feature_stream.close();
+    if (packAlignments)
+      alignment_stream.close();
+  }
+
+  public void writeVocabulary() throws IOException {
+    final String vocabularyFilename = output + File.separator + VOCABULARY_FILENAME;
+    logger.info("Writing vocabulary to " + vocabularyFilename);
+    Vocabulary.write(vocabularyFilename);
+  }
+
+  /**
+   * Integer-labeled, doubly-linked trie with some provisions for packing.
+   * 
+   * @author Juri Ganitkevitch
+   * 
+   * @param <D> The trie's value type.
+   */
+  class PackingTrie<D extends PackingTrieValue> {
+    int symbol;
+    PackingTrie<D> parent;
+
+    TreeMap<Integer, PackingTrie<D>> children;
+    List<D> values;
+
+    int address;
+
+    PackingTrie() {
+      address = -1;
+
+      symbol = 0;
+      parent = null;
+
+      children = new TreeMap<Integer, PackingTrie<D>>();
+      values = new ArrayList<D>();
+    }
+
+    PackingTrie(PackingTrie<D> parent, int symbol) {
+      this();
+      this.parent = parent;
+      this.symbol = symbol;
+    }
+
+    void add(int[] path, D value) {
+      add(path, 0, value);
+    }
+
+    private void add(int[] path, int index, D value) {
+      if (index == path.length)
+        this.values.add(value);
+      else {
+        PackingTrie<D> child = children.get(path[index]);
+        if (child == null) {
+          child = new PackingTrie<D>(this, path[index]);
+          children.put(path[index], child);
+        }
+        child.add(path, index + 1, value);
+      }
+    }
+
+    /**
+     * Calculate the size (in ints) of a packed trie node. Distinguishes downwards pointing (parent
+     * points to children) from upwards pointing (children point to parent) tries, as well as
+     * skeletal (no data, just the labeled links) and non-skeletal (nodes have a data block)
+     * packing.
+     * 
+     * @param downwards Are we packing into a downwards-pointing trie?
+     * @param skeletal Are we packing into a skeletal trie?
+     * 
+     * @return Number of bytes the trie node would occupy.
+     */
+    int size(boolean downwards, boolean skeletal) {
+      int size = 0;
+      if (downwards) {
+        // Number of children and links to children.
+        size = 1 + 2 * children.size();
+      } else {
+        // Link to parent.
+        size += 2;
+      }
+      // Non-skeletal packing: number of data items.
+      if (!skeletal)
+        size += 1;
+      // Non-skeletal packing: write size taken up by data items.
+      if (!skeletal && !values.isEmpty())
+        size += values.size() * values.get(0).size();
+
+      return size;
+    }
+
+    void clear() {
+      children.clear();
+      values.clear();
+    }
+  }
+
+  interface PackingTrieValue {
+    int size();
+  }
+
+  class SourceValue implements PackingTrieValue {
+    int lhs;
+    int data;
+    int target;
+
+    public SourceValue() {
+    }
+
+    SourceValue(int lhs, int data) {
+      this.lhs = lhs;
+      this.data = data;
+    }
+
+    void setTarget(int target) {
+      this.target = target;
+    }
+
+    public int size() {
+      return 3;
+    }
+  }
+
+  class TargetValue implements PackingTrieValue {
+    SourceValue parent;
+
+    TargetValue(SourceValue parent) {
+      this.parent = parent;
+    }
+
+    public int size() {
+      return 0;
+    }
+  }
+
+  abstract class PackingBuffer<T> {
+    private byte[] backing;
+    protected ByteBuffer buffer;
+
+    protected ArrayList<Integer> memoryLookup;
+    protected int totalSize;
+    protected ArrayList<Integer> onDiskOrder;
+
+    PackingBuffer() throws IOException {
+      allocate();
+      memoryLookup = new ArrayList<Integer>();
+      onDiskOrder = new ArrayList<Integer>();
+      totalSize = 0;
+    }
+
+    abstract int add(T item);
+
+    // Allocate a reasonably-sized buffer for the feature data.
+    private void allocate() {
+      backing = new byte[approximateMaximumSliceSize * DATA_SIZE_ESTIMATE];
+      buffer = ByteBuffer.wrap(backing);
+    }
+
+    // Reallocate the backing array and buffer, copies data over.
+    protected void reallocate() {
+      if (backing.length == Integer.MAX_VALUE)
+        return;
+      long attempted_length = backing.length * 2l;
+      int new_length;
+      // Detect overflow.
+      if (attempted_length >= Integer.MAX_VALUE)
+        new_length = Integer.MAX_VALUE;
+      else
+        new_length = (int) attempted_length;
+      byte[] new_backing = new byte[new_length];
+      System.arraycopy(backing, 0, new_backing, 0, backing.length);
+      int old_position = buffer.position();
+      ByteBuffer new_buffer = ByteBuffer.wrap(new_backing);
+      new_buffer.position(old_position);
+      buffer = new_buffer;
+      backing = new_backing;
+    }
+
+    /**
+     * Prepare the data buffer for disk writing.
+     */
+    void initialize() {
+      onDiskOrder.clear();
+    }
+
+    /**
+     * Enqueue a data block for later writing.
+     * 
+     * @param block_index The index of the data block to add to writing queue.
+     * @return The to-be-written block's output index.
+     */
+    int write(int block_index) {
+      onDiskOrder.add(block_index);
+      return onDiskOrder.size() - 1;
+    }
+
+    /**
+     * Performs the actual writing to disk in the order specified by calls to write() since the last
+     * call to initialize().
+     * 
+     * @param out
+     * @throws IOException
+     */
+    void flush(DataOutputStream out) throws IOException {
+      writeHeader(out);
+      int size;
+      int block_address;
+      for (int block_index : onDiskOrder) {
+        block_address = memoryLookup.get(block_index);
+        size = blockSize(block_index);
+        out.write(backing, block_address, size);
+      }
+    }
+
+    void clear() {
+      buffer.clear();
+      memoryLookup.clear();
+      onDiskOrder.clear();
+    }
+
+    boolean overflowing() {
+      return (buffer.position() >= DATA_SIZE_LIMIT);
+    }
+
+    private void writeHeader(DataOutputStream out) throws IOException {
+      if (out.size() == 0) {
+        out.writeInt(onDiskOrder.size());
+        out.writeInt(totalSize);
+        int disk_position = headerSize();
+        for (int block_index : onDiskOrder) {
+          out.writeInt(disk_position);
+          disk_position += blockSize(block_index);
+        }
+      } else {
+        throw new RuntimeException("Got a used stream for header writing.");
+      }
+    }
+
+    private int headerSize() {
+      // One integer for each data block, plus number of blocks and total size.
+      return 4 * (onDiskOrder.size() + 2);
+    }
+
+    private int blockSize(int block_index) {
+      int block_address = memoryLookup.get(block_index);
+      return (block_index < memoryLookup.size() - 1 ? memoryLookup.get(block_index + 1) : totalSize)
+          - block_address;
+    }
+  }
+
+  class FeatureBuffer extends PackingBuffer<TreeMap<Integer, Float>> {
+
+    private IntEncoder idEncoder;
+
+    FeatureBuffer() throws IOException {
+      super();
+      idEncoder = types.getIdEncoder();
+      logger.info("Encoding feature ids in: " + idEncoder.getKey());
+    }
+
+    /**
+     * Add a block of features to the buffer.
+     * 
+     * @param features TreeMap with the features for one rule.
+     * @return The index of the resulting data block.
+     */
+    int add(TreeMap<Integer, Float> features) {
+      int data_position = buffer.position();
+
+      // Over-estimate how much room this addition will need: for each
+      // feature (ID_SIZE for label, "upper bound" of 4 for the value), plus ID_SIZE for
+      // the number of features. If this won't fit, reallocate the buffer.
+      int size_estimate = (4 + EncoderConfiguration.ID_SIZE) * features.size()
+          + EncoderConfiguration.ID_SIZE;
+      if (buffer.capacity() - buffer.position() <= size_estimate)
+        reallocate();
+
+      // Write features to buffer.
+      idEncoder.write(buffer, features.size());
+      for (Integer k : features.descendingKeySet()) {
+        float v = features.get(k);
+        // Sparse features.
+        if (v != 0.0) {
+          idEncoder.write(buffer, k);
+          encoderConfig.encoder(k).write(buffer, v);
+        }
+      }
+      // Store position the block was written to.
+      memoryLookup.add(data_position);
+      // Update total size (in bytes).
+      totalSize = buffer.position();
+
+      // Return block index.
+      return memoryLookup.size() - 1;
+    }
+  }
+
+  class AlignmentBuffer extends PackingBuffer<byte[]> {
+
+    AlignmentBuffer() throws IOException {
+      super();
+    }
+
+    /**
+     * Add a rule alignments to the buffer.
+     * 
+     * @param alignments a byte array with the alignment points for one rule.
+     * @return The index of the resulting data block.
+     */
+    int add(byte[] alignments) {
+      int data_position = buffer.position();
+      int size_estimate = alignments.length + 1;
+      if (buffer.capacity() - buffer.position() <= size_estimate)
+        reallocate();
+
+      // Write alignment points to buffer.
+      buffer.put((byte) (alignments.length / 2));
+      buffer.put(alignments);
+
+      // Store position the block was written to.
+      memoryLookup.add(data_position);
+      // Update total size (in bytes).
+      totalSize = buffer.position();
+      // Return block index.
+      return memoryLookup.size() - 1;
+    }
+  }
+
+  class PackingFileTuple implements Comparable<PackingFileTuple> {
+    private File sourceFile;
+    private File targetLookupFile;
+    private File targetFile;
+
+    private File featureFile;
+    private File alignmentFile;
+
+    PackingFileTuple(String prefix) {
+      sourceFile = new File(output + File.separator + prefix + ".source");
+      targetFile = new File(output + File.separator + prefix + ".target");
+      targetLookupFile = new File(output + File.separator + prefix + ".target.lookup");
+      featureFile = new File(output + File.separator + prefix + ".features");
+
+      alignmentFile = null;
+      if (packAlignments)
+        alignmentFile = new File(output + File.separator + prefix + ".alignments");
+
+      logger.info("Allocated slice: " + sourceFile.getAbsolutePath());
+    }
+
+    DataOutputStream getSourceOutput() throws IOException {
+      return getOutput(sourceFile);
+    }
+
+    DataOutputStream getTargetOutput() throws IOException {
+      return getOutput(targetFile);
+    }
+
+    DataOutputStream getTargetLookupOutput() throws IOException {
+      return getOutput(targetLookupFile);
+    }
+
+    DataOutputStream getFeatureOutput() throws IOException {
+      return getOutput(featureFile);
+    }
+
+    DataOutputStream getAlignmentOutput() throws IOException {
+      if (alignmentFile != null)
+        return getOutput(alignmentFile);
+      return null;
+    }
+
+    private DataOutputStream getOutput(File file) throws IOException {
+      if (file.createNewFile()) {
+        return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
+      } else {
+        throw new RuntimeException("File doesn't exist: " + file.getName());
+      }
+    }
+
+    long getSize() {
+      return sourceFile.length() + targetFile.length() + featureFile.length();
+    }
+
+    @Override
+    public int compareTo(PackingFileTuple o) {
+      if (getSize() > o.getSize()) {
+        return -1;
+      } else if (getSize() < o.getSize()) {
+        return 1;
+      } else {
+        return 0;
+      }
+    }
+  }
+}