You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:37 UTC

[26/60] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java b/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
new file mode 100644
index 0000000..d054515
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.server;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.StringReader;
+import java.net.Socket;
+import java.net.SocketException;
+import java.net.URLDecoder;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.Translations;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.io.JSONMessage;
+import org.apache.joshua.decoder.io.TranslationRequestStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class handles a concurrent request for translations from a newly opened socket, for
+ * both raw TCP/IP connections and for HTTP connections.
+ * 
+ */
+public class ServerThread extends Thread implements HttpHandler {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ServerThread.class);
+  private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
+  
+  private final JoshuaConfiguration joshuaConfiguration;
+  private Socket socket = null;
+  private final Decoder decoder;
+
+  /**
+   * Creates a new TcpServerThread that can run a set of translations.
+   * 
+   * @param socket the socket representing the input/output streams
+   * @param decoder the configured decoder that handles performing translations
+   * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   */
+  public ServerThread(Socket socket, Decoder decoder, JoshuaConfiguration joshuaConfiguration) {
+    this.joshuaConfiguration = joshuaConfiguration;
+    this.socket = socket;
+    this.decoder = decoder;
+  }
+
+  /**
+   * Reads the input from the socket, submits the input to the decoder, transforms the resulting
+   * translations into the required output format, writes out the formatted output, then closes the
+   * socket.
+   */
+  @Override
+  public void run() {
+
+    //TODO: use try-with-resources block
+    try {
+      BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), FILE_ENCODING));
+
+      TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+      try {
+        Translations translations = decoder.decodeAll(request);
+        
+        OutputStream out = socket.getOutputStream();
+        
+        for (Translation translation: translations) {
+          out.write(translation.toString().getBytes());
+        }
+        
+      } catch (SocketException e) {
+        LOG.error(" Socket interrupted", e);
+        request.shutdown();
+      } finally {
+        reader.close();
+        socket.close();
+      }
+    } catch (IOException e) {
+      LOG.error(e.getMessage(), e);
+    }
+  }
+  
+  public HashMap<String, String> queryToMap(String query){
+    HashMap<String, String> result = new HashMap<String, String>();
+    for (String param : query.split("&")) {
+        String pair[] = param.split("=");
+        if (pair.length > 1) {
+            result.put(pair[0], pair[1]);
+        } else {
+            result.put(pair[0], "");
+        }
+    }
+    return result;
+  } 
+
+  private class HttpWriter extends OutputStream {
+
+    private HttpExchange client = null;
+    private OutputStream out = null;
+    
+    public HttpWriter(HttpExchange client) {
+      this.client = client;
+      client.getResponseHeaders().add("Access-Control-Allow-Origin", "*");
+    }
+    
+    @Override
+    public void write(byte[] response) throws IOException {
+      client.sendResponseHeaders(200, response.length);
+      out = client.getResponseBody();
+      out.write(response);
+      out.close();
+    }
+
+    @Override
+    public void write(int b) throws IOException {
+      out.write(b);
+    }
+  }
+
+  /**
+   * Called to handle an HTTP connection. This looks for metadata in the URL string, which is processed
+   * if present. It also then handles returning a JSON-formatted object to the caller. 
+   * 
+   * @param client the client connection
+   */
+  @Override
+  public synchronized void handle(HttpExchange client) throws IOException {
+
+    HashMap<String, String> params = queryToMap(URLDecoder.decode(client.getRequestURI().getQuery(), "UTF-8"));
+    String query = params.get("q");
+    String meta = params.get("meta");
+    
+    BufferedReader reader = new BufferedReader(new StringReader(query));
+    TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+    
+    Translations translations = decoder.decodeAll(request);
+    JSONMessage message = new JSONMessage();
+    if (meta != null && ! meta.isEmpty())
+      handleMetadata(meta, message);
+
+    for (Translation translation: translations) {
+      LOG.info("TRANSLATION: '{}' with {} k-best items", translation, translation.getStructuredTranslations().size());
+      message.addTranslation(translation);
+    }
+
+    OutputStream out = new HttpWriter(client);
+    out.write(message.toString().getBytes());
+    if (LOG.isDebugEnabled())
+      LOG.debug(message.toString());
+    out.close();
+    
+    reader.close();
+  }
+  
+  /**
+   * Processes metadata commands received in the HTTP request. Some commands result in sending data back.
+   *
+   * @param meta the metadata request
+   * @return result string (for some commands)
+   */
+  private void handleMetadata(String meta, JSONMessage message) {
+    String[] tokens = meta.split("\\s+", 2);
+    String type = tokens[0];
+    String args = tokens.length > 1 ? tokens[1] : "";
+    
+    if (type.equals("get_weight")) {
+      String weight = tokens[1];
+      LOG.info("WEIGHT: %s = %.3f", weight, Decoder.weights.getWeight(weight));
+
+    } else if (type.equals("set_weights")) {
+      // Change a decoder weight
+      String[] argTokens = args.split("\\s+");
+      for (int i = 0; i < argTokens.length; i += 2) {
+        String feature = argTokens[i];
+        String newValue = argTokens[i+1];
+        float old_weight = Decoder.weights.getWeight(feature);
+        Decoder.weights.set(feature, Float.parseFloat(newValue));
+        LOG.info("set_weights: {} {} -> {}", feature, old_weight, Decoder.weights.getWeight(feature));
+      }
+      
+      message.addMetaData("weights " + Decoder.weights.toString());
+      
+    } else if (type.equals("get_weights")) {
+      message.addMetaData("weights " + Decoder.weights.toString());
+      
+    } else if (type.equals("add_rule")) {
+      String argTokens[] = args.split(" \\|\\|\\| ");
+  
+      if (argTokens.length < 3) {
+        LOG.error("* INVALID RULE '{}'", meta);
+        return;
+      }
+      
+      String lhs = argTokens[0];
+      String source = argTokens[1];
+      String target = argTokens[2];
+      String featureStr = "";
+      if (argTokens.length > 3) 
+        featureStr = argTokens[3];
+          
+      /* Prepend source and target side nonterminals for phrase-based decoding. Probably better
+       * handled in each grammar type's addRule() function.
+       */
+      String ruleString = (joshuaConfiguration.search_algorithm.equals("stack"))
+          ? String.format("%s ||| [X,1] %s ||| [X,1] %s ||| custom=1 %s", lhs, source, target, featureStr)
+          : String.format("%s ||| %s ||| %s ||| custom=1 %s", lhs, source, target, featureStr);
+      
+      Rule rule = new HieroFormatReader().parseLine(ruleString);
+      decoder.addCustomRule(rule);
+      
+      LOG.info("Added custom rule {}", rule.toString());
+  
+    } else if (type.equals("list_rules")) {
+  
+      LOG.info("list_rules");
+      
+      // Walk the the grammar trie
+      ArrayList<Trie> nodes = new ArrayList<Trie>();
+      nodes.add(decoder.getCustomPhraseTable().getTrieRoot());
+  
+      while (nodes.size() > 0) {
+        Trie trie = nodes.remove(0);
+  
+        if (trie == null)
+          continue;
+  
+        if (trie.hasRules()) {
+          for (Rule rule: trie.getRuleCollection().getRules()) {
+            message.addRule(rule.toString());
+            LOG.debug("Found rule: " + rule);
+          }
+        }
+  
+        if (trie.getExtensions() != null)
+          nodes.addAll(trie.getExtensions());
+      }
+  
+    } else if (type.equals("remove_rule")) {
+      
+      Rule rule = new HieroFormatReader().parseLine(args);
+      
+      LOG.info("remove_rule " + rule);
+  
+      Trie trie = decoder.getCustomPhraseTable().getTrieRoot();
+      int[] sourceTokens = rule.getFrench();
+      for (int i = 0; i < sourceTokens.length; i++) {
+        Trie nextTrie = trie.match(sourceTokens[i]);
+        if (nextTrie == null)
+          return;
+        
+        trie = nextTrie;
+      }
+
+      if (trie.hasRules()) {
+        for (Rule ruleCand: trie.getRuleCollection().getRules()) {
+          if (Arrays.equals(rule.getEnglish(), ruleCand.getEnglish())) {
+            trie.getRuleCollection().getRules().remove(ruleCand);
+            break;
+          }
+        }
+        return;
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java b/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java
new file mode 100644
index 0000000..e054186
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.server;
+
+import java.io.IOException;
+import java.net.ServerSocket;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * TCP/IP server. Accepts newline-separated input sentences written to the socket, translates them
+ * all, and writes the resulting translations back out to the socket.
+ */
+public class TcpServer {
+  private static final Logger LOG = LoggerFactory.getLogger(TcpServer.class);
+  private final JoshuaConfiguration joshuaConfiguration;
+  private final Decoder decoder;
+  private final int port;
+
+  /**
+   * @param decoder the decoder passed to every {@link ServerThread}
+   * @param port the port to listen on
+   * @param joshuaConfiguration the configuration passed to every {@link ServerThread}
+   */
+  public TcpServer(Decoder decoder, int port,JoshuaConfiguration joshuaConfiguration) {
+    this.joshuaConfiguration = joshuaConfiguration;
+    this.decoder = decoder;
+    this.port = port;
+  }
+
+  /**
+   * Listens on {@code port} for new socket connections and starts one {@link ServerThread} per
+   * connection, so that multiple connections are handled concurrently. Never returns normally.
+   * @throws RuntimeException if the port cannot be bound or accepting a connection fails
+   */
+  public void start() {
+    // try-with-resources releases the listening socket when accept() fails.
+    try (ServerSocket serverSocket = new ServerSocket(port)) {
+      LOG.info("** TCP Server running and listening on port {}.", port);
+      while (true)
+        new ServerThread(serverSocket.accept(), decoder, joshuaConfiguration).start();
+    } catch (IOException e) {
+      throw new RuntimeException(String.format("Could not listen on port: %d.", port), e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java b/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
new file mode 100644
index 0000000..2915685
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+
+
+/**
+ * A subsampler which takes in word-alignments as well as the F and E files. To remove redundant
+ * code, this class uses callback techniques in order to "override" the superclass methods.
+ *
+ * @see org.apache.joshua.subsample.Subsampler
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class AlignedSubsampler extends Subsampler {
+  public AlignedSubsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+    super(testFiles, maxN, targetCount);
+  }
+
+  /**
+   * Subsamples word-aligned corpora, writing one output file each for F, E and the alignments.
+   * @param filelist list of source files to subsample from
+   * @param targetFtoERatio goal for ratio of output F length to output E length
+   * @param extf extension of F files
+   * @param exte extension of E files
+   * @param exta extension of alignment files
+   * @param fpath path to source F files
+   * @param epath path to source E files
+   * @param apath path to source alignment files
+   * @param output basename for output files (will append extensions)
+   * @throws IOException if there is an error reading the input file(s)
+   */
+  public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+      String exta, String fpath, String epath, String apath, String output) throws IOException {
+    BufferedWriter foreignOut = utf8Writer(output + "." + extf);
+    BufferedWriter nativeOut = utf8Writer(output + "." + exte);
+    BufferedWriter alignmentOut = utf8Writer(output + "." + exta);
+    PhraseWriter writer = new PhraseWriter(foreignOut, nativeOut, alignmentOut);
+    BiCorpusFactory factory = new BiCorpusFactory(fpath, epath, apath, extf, exte, exta) {
+      public BiCorpus fromFiles(String f) throws IOException {
+        return this.alignedFromFiles(f);
+      }
+    };
+    this.subsample(filelist, targetFtoERatio, writer, factory);
+  }
+
+  /** Opens {@code path} for writing with the "UTF8" charset, wrapped in a buffer. */
+  private static BufferedWriter utf8Writer(String path) throws IOException {
+    return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF8"));
+  }
+
+  @SuppressWarnings("static-access")
+  public static void main(String[] args) {
+    new SubsamplerCLI() { /* Local class definition */
+      // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+      protected final Option oa = OptionBuilder.withArgName("lang").hasArg()
+          .withDescription("Word alignment extension").isRequired().create("a");
+
+      // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+      protected final Option oapath = OptionBuilder.withArgName("path").hasArg()
+          .withDescription("Directory containing word alignment files").create("apath");
+
+      public Options getCliOptions() {
+        return super.getCliOptions().addOption(oa).addOption(oapath);
+      }
+
+      public String getClassName() {
+        return AlignedSubsampler.class.getName();
+      }
+
+      public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+          throws IOException {
+        new AlignedSubsampler(testFiles, maxN, targetCount).subsample(ot.getValue(), ratio,
+            of.getValue(), oe.getValue(), oa.getValue(), ofpath.getValue(), oepath.getValue(),
+            oapath.getValue(), ooutput.getValue());
+      }
+    }.runMain(args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java b/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java
new file mode 100644
index 0000000..073eb5c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+/**
+ * A set of word alignments between an F phrase and an E phrase. The implementation uses a
+ * two-dimensional bit vector, though for our purposes we could just keep the original string around
+ * (which would save lots of time parsing and reconstructing the string).
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class Alignment {
+  private final short eLength;
+  private final short fLength;
+  private final M2 aligned;
+
+  /**
+   * @param fLength number of words in the F phrase
+   * @param eLength number of words in the E phrase
+   * @param alignments whitespace-separated {@code f-e} index pairs; may be null or blank
+   * @throws IllegalArgumentException if a pair is not of the form {@code f-e}
+   * @throws IndexOutOfBoundsException if an index is not smaller than its phrase length
+   */
+  public Alignment(short fLength, short eLength, String alignments) {
+    this.eLength = eLength;
+    this.fLength = fLength;
+    this.aligned = new M2(fLength, eLength);
+    // Trim first: split() would otherwise turn leading whitespace into an empty first token.
+    String trimmed = (alignments == null) ? "" : alignments.trim();
+    if (trimmed.isEmpty())
+      return;
+    for (String al : trimmed.split("\\s+")) { // TODO: joshua.util.Regex
+      String[] pair = al.split("-");
+      if (pair.length != 2)
+        throw new IllegalArgumentException("Malformed alignment string: " + alignments);
+      short f = Short.parseShort(pair[0]);
+      short e = Short.parseShort(pair[1]);
+      if (f >= fLength || e >= eLength)
+        throw new IndexOutOfBoundsException("out of bounds: " + f + "," + e);
+      aligned.set(f, e);
+    }
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    for (short i = 0; i < fLength; i++)
+      for (short j = 0; j < eLength; j++)
+        if (aligned.get(i, j)) sb.append(i).append('-').append(j).append(' ');
+    // Remove trailing space
+    if (sb.length() > 0) sb.setLength(sb.length() - 1);
+    return sb.toString();
+  }
+
+  /** A (short,short)->boolean map for storing alignments. */
+  private final static class M2 {
+    private final short width;
+    private final boolean[] bits;
+    public M2(short f, short e) {
+      width = f;
+      bits = new boolean[f * e];
+    }
+    public boolean get(short f, short e) {
+      return bits[width * e + f];
+    }
+    public void set(short f, short e) {
+      try {
+        bits[width * e + f] = true;
+      } catch (ArrayIndexOutOfBoundsException ee) {
+        // Keep the original exception as the cause so the failing index is not lost.
+        throw new RuntimeException("Set(" + f + ", " + e + "): caught " + ee, ee);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java
new file mode 100644
index 0000000..06ec0e9
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.joshua.corpus.Phrase;
+
+/**
+ * Class for representing a sentence-aligned bi-corpus (with optional word-alignments).
+ * <p>
+ * In order to avoid memory crashes we no longer extend an ArrayList, which tries to cache the
+ * entire file in memory at once. This means we'll re-read through each file (1 +
+ * {@link Subsampler#MAX_SENTENCE_LENGTH} / binsize) times where binsize is determined by the
+ * <code>subsample(String, float, PhraseWriter, BiCorpusFactory)</code> method.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class BiCorpus implements Iterable<PhrasePair> {
+  protected final String foreignFileName;
+  protected final String nativeFileName;
+  protected final String alignmentFileName;
+
+  /**
+   * Constructor for unaligned BiCorpus.
+   * @param foreignFileName name of the foreign-side file
+   * @param nativeFileName name of the native-side file
+   * @throws IOException not thrown by the current code, which wraps I/O failures in RuntimeException
+   */
+  public BiCorpus(String foreignFileName, String nativeFileName) throws IOException {
+    this(foreignFileName, nativeFileName, null);
+  }
+
+  /**
+   * Constructor for word-aligned BiCorpus. Reads through all the files once to validate them.
+   * @param foreignFileName name of the foreign-side file
+   * @param nativeFileName name of the native-side file
+   * @param alignmentFileName name of the word-alignment file, or null if there is none
+   * @throws IOException not thrown by the current code, which wraps I/O failures in RuntimeException
+   * @throws IllegalArgumentException if an alignment line is malformed
+   * @throws IndexOutOfBoundsException if an alignment index is out of bounds for its sentence pair
+   */
+  public BiCorpus(String foreignFileName, String nativeFileName, String alignmentFileName)
+      throws IOException, IllegalArgumentException, IndexOutOfBoundsException {
+    this.foreignFileName = foreignFileName;
+    this.nativeFileName = nativeFileName;
+    this.alignmentFileName = alignmentFileName;
+
+    // Check for fileLengthMismatchException
+    // Of course, that will be checked for in each iteration
+    //
+    // We write it this way to avoid warnings from the foreach style loop
+    Iterator<PhrasePair> it = iterator();
+    while (it.hasNext()) {
+      it.next();
+    }
+  }
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+  // BUG: We don't close file handles. The other reader classes apparently have finalizers to handle
+  // this well enough for our purposes, but we should migrate to using joshua.util.io.LineReader and
+  // be sure to close it in the end.
+
+  // We're not allowed to throw exceptions from Iterator/Iterable
+  // so we have evil boilerplate to crash the system
+  /**
+   * Iterate through the files represented by this <code>BiCorpus</code>, returning a
+   * {@link PhrasePair} for each pair (or triple) of lines.
+   */
+  @SuppressWarnings("resource")
+  public Iterator<PhrasePair> iterator() {
+    PhraseReader closureRF = null;
+    PhraseReader closureRE = null;
+    BufferedReader closureRA = null;
+    try {
+      closureRF = new PhraseReader(new FileReader(this.foreignFileName), (byte) 1);
+      closureRE = new PhraseReader(new FileReader(this.nativeFileName), (byte) 0);
+      closureRA =
+          (null == this.alignmentFileName ? null : new BufferedReader(new FileReader(
+              this.alignmentFileName)));
+    } catch (FileNotFoundException e) {
+      throw new RuntimeException("File not found", e);
+    }
+    // Making final for closure capturing in the local class definition
+    final PhraseReader rf = closureRF;
+    final PhraseReader re = closureRE;
+    final BufferedReader ra = closureRA;
+
+    return new Iterator<PhrasePair>() { /* Local class definition */
+      private Phrase nextForeignPhrase = null;
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+
+      public boolean hasNext() {
+        if (null == this.nextForeignPhrase) {
+          try {
+            this.nextForeignPhrase = rf.readPhrase();
+          } catch (IOException e) {
+            throw new RuntimeException("IOException", e);
+          }
+        }
+        return null != this.nextForeignPhrase;
+      }
+
+      /**
+       * Returns the next pair in which both sides are non-empty, skipping pairs with an empty
+       * side. Written as a loop rather than recursively so that a long run of skipped pairs
+       * cannot overflow the stack.
+       */
+      public PhrasePair next() {
+        while (this.hasNext()) {
+          Phrase f = this.nextForeignPhrase;
+          Phrase e = null;
+          try {
+            e = re.readPhrase();
+          } catch (IOException ioe) {
+            throw new RuntimeException("IOException", ioe);
+          }
+          if (null == e) {
+            fileLengthMismatchException();
+            return null; // Needed to make javac happy
+          }
+
+          if (e.size() == 0 || f.size() == 0) {
+            // Skip this pair and move on to the next foreign phrase.
+            // NOTE(review): no alignment line is consumed for a skipped pair; confirm that the
+            // alignment file has no line for such pairs.
+            this.nextForeignPhrase = null;
+            continue;
+          }
+
+          if (null == ra) {
+            this.nextForeignPhrase = null;
+            return new PhrasePair(f, e);
+          }
+
+          String line = null;
+          try {
+            line = ra.readLine();
+          } catch (IOException ioe) {
+            throw new RuntimeException("IOException", ioe);
+          }
+          if (null == line) {
+            fileLengthMismatchException();
+            return null; // Needed to make javac happy
+          }
+          Alignment a = new Alignment((short) f.size(), (short) e.size(), line);
+          this.nextForeignPhrase = null;
+          return new PhrasePair(f, e, a);
+        }
+        throw new NoSuchElementException();
+      }
+    }; /* End local class definition */
+  } /* end iterator() */
+
+  private static void fileLengthMismatchException() throws RuntimeException {
+    throw new RuntimeException("Mismatched file lengths!");
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
new file mode 100644
index 0000000..eda3bf5
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * A callback closure for <code>Subsampler.subsample</code>. This class is used by
+ * {@link AlignedSubsampler} in order to "override" methods of {@link Subsampler}, minimizing code
+ * duplication.
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class BiCorpusFactory {
+  protected final String fpath; // directory prefixes: always end with File.separator
+  protected final String epath;
+  protected final String apath;
+  protected final String extf; // extensions: always start with "."
+  protected final String exte;
+  protected final String exta; // null when no alignment extension was supplied
+
+  /** A null path means "."; a null {@code exta} leaves {@link #alignedFromFiles} unusable. */
+  public BiCorpusFactory(String fpath, String epath, String apath, String extf, String exte,
+      String exta) {
+    // Concatenate once here rather than inside the loops that call fromFiles.
+    this.fpath = (fpath == null ? "." : fpath) + File.separator;
+    this.epath = (epath == null ? "." : epath) + File.separator;
+    this.apath = (apath == null ? "." : apath) + File.separator;
+    this.extf = "." + extf;
+    this.exte = "." + exte;
+    this.exta = (exta == null ? null : "." + exta);
+  }
+
+  /**
+   * Generate unaligned {@link org.apache.joshua.subsample.BiCorpus} by default.
+   * @param f basename shared by the corpus files; directory prefix and extension are added here
+   * @return an unaligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if there is an error reading input file
+   */
+  public BiCorpus fromFiles(String f) throws IOException {
+    return this.unalignedFromFiles(f);
+  }
+
+  /**
+   * Generate unaligned BiCorpus from the F and E files named after {@code f}.
+   * @param f basename shared by the corpus files; directory prefix and extension are added here
+   * @return an unaligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if there is an error reading input file
+   */
+  public BiCorpus unalignedFromFiles(String f) throws IOException {
+    return new BiCorpus(fpath + f + extf, epath + f + exte);
+  }
+
+  /**
+   * Generate aligned BiCorpus from the F, E and alignment files named after {@code f}.
+   * @param f basename shared by the corpus files; directory prefix and extension are added here
+   * @return an aligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if no alignment extension was configured, or on an input file error
+   */
+  public BiCorpus alignedFromFiles(String f) throws IOException {
+    // Without this guard the concatenation below would silently yield a path ending in "null".
+    if (null == exta) throw new IOException("No alignment extension configured for " + f);
+    return new BiCorpus(fpath + f + extf, epath + f + exte, apath + f + exta);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java b/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java
new file mode 100644
index 0000000..41c05d3
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import org.apache.joshua.corpus.Phrase;
+
+
+/**
+ * Phrase-aligned tuple class associating an F phrase, E phrase, and (possibly null)
+ * word-alignments. This is primarily for maintaining sentence-alignment.
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class PhrasePair {
+  // All three are assigned exactly once; the alignment is null for unaligned pairs.
+  private final Phrase f;
+  private final Phrase e;
+  private final Alignment a;
+
+  /** Builds an unaligned pair, i.e. one whose alignment is null. */
+  public PhrasePair(Phrase f_, Phrase e_) {
+    this.f = f_;
+    this.e = e_;
+    this.a = null;
+  }
+
+  /** Builds a pair that carries the given (possibly null) word alignment. */
+  public PhrasePair(Phrase f, Phrase e, Alignment a) {
+    this.f = f;
+    this.e = e;
+    this.a = a;
+  }
+
+  /** @return the F phrase */
+  public Phrase getF() {
+    return this.f;
+  }
+
+  /** @return the E phrase */
+  public Phrase getE() {
+    return this.e;
+  }
+
+  /** @return the word alignment, or null when the pair was built without one */
+  public Alignment getAlignment() {
+    return this.a;
+  }
+
+  /** @return the size of the F phrase divided by the size of the E phrase, as a float */
+  public float ratioFtoE() {
+    final float fSize = (float) this.f.size();
+    return fSize / ((float) this.e.size());
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java
new file mode 100644
index 0000000..6db216f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.joshua.corpus.BasicPhrase;
+
+/**
+ * Wrapper class to read in each line as a BasicPhrase.
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class PhraseReader extends BufferedReader {
+  private final byte language; // never reassigned; handed to every BasicPhrase built below
+
+  public PhraseReader(Reader r, byte language) {
+    super(r);
+    this.language = language;
+  }
+
+  public BasicPhrase readPhrase() throws IOException {
+    String line = super.readLine(); // null once the underlying reader has no more lines
+    return (line == null ? null : new BasicPhrase(this.language, line)); // null = end of input
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
new file mode 100644
index 0000000..11bbf08
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+
+
+/**
+ * A PhrasePair-parallel BufferedWriter. In an ideal world we could get the compiler to inline all
+ * of this, to have zero-overhead while not duplicating code. Alas, Java's not that cool. The
+ * "final" could help on JIT at least.
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+final public class PhraseWriter {
+  private final BufferedWriter wf;
+  private final BufferedWriter we;
+  private final BufferedWriter wa; // null when no alignment output is wanted
+
+  public PhraseWriter(BufferedWriter wf_, BufferedWriter we_) {
+    this(wf_, we_, null); // no alignment writer: the wa steps below are all skipped
+  }
+
+  public PhraseWriter(BufferedWriter wf, BufferedWriter we, BufferedWriter wa) {
+    this.wf = wf;
+    this.we = we;
+    this.wa = wa;
+  }
+
+  public void write(PhrasePair pp) throws IOException { // appends text only; see newLine()
+    this.wf.write(pp.getF().toString());
+    this.we.write(pp.getE().toString());
+    // NOTE(review): assumes pp has a non-null alignment whenever wa is set -- confirm callers.
+    if (null != this.wa) this.wa.write(pp.getAlignment().toString());
+  }
+
+  public void newLine() throws IOException {
+    this.wf.newLine();
+    this.we.newLine();
+    if (null != this.wa) this.wa.newLine();
+  }
+
+  public void flush() throws IOException {
+    this.wf.flush();
+    this.we.flush();
+    if (null != this.wa) this.wa.flush();
+  }
+
+  /** Closes every writer, even if an earlier close() throws; the last exception thrown wins. */
+  public void close() throws IOException {
+    try {
+      this.wf.close();
+    } finally {
+      try {
+        this.we.close();
+      } finally {
+        if (null != this.wa) this.wa.close();
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java b/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java
new file mode 100644
index 0000000..36e1925
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.joshua.corpus.BasicPhrase;
+import org.apache.joshua.corpus.Phrase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A class for subsampling a large (F,E)-parallel sentence-aligned corpus to generate a smaller
+ * corpus whose N-grams are relevant to some seed corpus. The idea of subsampling owes to Kishore
+ * Papineni.
+ * 
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class Subsampler {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Subsampler.class);
+
+  protected Map<Phrase, Integer> ngramCounts; // test-set n-gram -> uses credited so far
+  protected int maxN; // passed to getSubPhrases (presumably the longest n-gram length)
+  protected int targetCount; // n-grams whose count has reached this no longer select sentences
+  protected int maxSubsample = 1500000; // stops the bin loop once the running total exceeds it
+
+  protected static final int MAX_SENTENCE_LENGTH = 100;
+  protected static final int MIN_RATIO_LENGTH = 10;
+
+  /** Loads the n-grams of all test files; every n-gram starts with a count of 0. */
+  public Subsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+    this.maxN = maxN;
+    this.targetCount = targetCount;
+    this.ngramCounts = loadNgrams(testFiles);
+  }
+
+  /** Maps each sub-phrase (per getSubPhrases(maxN)) of every test line to a starting count of 0. */
+  private HashMap<Phrase, Integer> loadNgrams(String[] files) throws IOException {
+    HashMap<Phrase, Integer> map = new HashMap<Phrase, Integer>();
+    for (String fn : files) {
+      LOG.debug("Loading test set from {}", fn);
+      // NOTE(review): FileReader decodes with the platform charset, yet output is written as UTF8.
+      PhraseReader reader = new PhraseReader(new FileReader(fn), (byte) 1);
+      Phrase phrase;
+      int lineCount = 0;
+      try {
+        while ((phrase = reader.readPhrase()) != null) {
+          lineCount++;
+          List<Phrase> ngrams = phrase.getSubPhrases(this.maxN);
+          for (Phrase ngram : ngrams)
+            map.put(ngram, 0);
+        }
+      } finally {
+        reader.close();
+      }
+      LOG.debug("Processed {} lines in {}", lineCount, fn);
+    }
+    LOG.debug("Test set: {} ngrams", map.size());
+    return map;
+  }
+
+  /**
+   * The general subsampler function for external use.
+   *
+   * @param filelist file listing the training file basenames to subsample from, one per line
+   * @param targetFtoERatio goal for ratio of output F length to output E length (0 = no filter)
+   * @param extf extension of F files
+   * @param exte extension of E files
+   * @param fpath path to source F files
+   * @param epath path to source E files
+   * @param output basename for output files (will append extensions)
+   * @throws IOException if there is an issue reading one of the input files
+   */
+  public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+      String fpath, String epath, String output) throws IOException {
+    this.subsample(filelist, targetFtoERatio, new PhraseWriter(new BufferedWriter(
+        new OutputStreamWriter(new FileOutputStream(output + "." + extf), "UTF8")),
+        new BufferedWriter(
+            new OutputStreamWriter(new FileOutputStream(output + "." + exte), "UTF8"))),
+        new BiCorpusFactory(fpath, epath, null, extf, exte, null));
+  }
+
+  /**
+   * The main wrapper for the subsample worker. Closes the
+   * {@link org.apache.joshua.subsample.PhraseWriter} before exiting.
+   * @param filelist file listing the training file basenames to subsample from, one per line
+   * @param targetFtoERatio goal for ratio of output F length to output E length
+   * @param out a {@link org.apache.joshua.subsample.PhraseWriter} to flush data to
+   * @param bcFactory used to generate a sentence-aligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if there is an issue reading one of the input files
+   */
+  protected void subsample(String filelist, float targetFtoERatio, PhraseWriter out,
+      BiCorpusFactory bcFactory) throws IOException {
+    try {
+      // Read filenames into a list
+      List<String> files = new ArrayList<String>();
+      {
+        FileReader fr = null;
+        BufferedReader br = null;
+        try {
+          fr = new FileReader(filelist);
+          br = new BufferedReader(fr);
+          String file;
+          while ((file = br.readLine()) != null) {
+            files.add(file);
+          }
+        } finally {
+          // Maybe redundant, but UMD's FixBugs says to
+          // close br (and close is idempotent anyways)
+          if (null != fr) fr.close();
+          if (null != br) br.close();
+        }
+      }
+
+      int totalSubsampled = 0;
+      // Iterating on files in order biases towards files
+      // earlier in the list
+      for (String f : files) {
+        LOG.info("Loading training data: {}", f);
+
+        BiCorpus bc = bcFactory.fromFiles(f);
+        // Keyed by lowercased sentence text so that repeated sentence pairs are selected once.
+        Map<String, PhrasePair> set = new HashMap<String, PhrasePair>();
+
+        int binsize = 10; // BUG: Magic-Number
+        int max_k = MAX_SENTENCE_LENGTH / binsize;
+        LOG.debug("Looking in length range");
+        // Iterating bins from small to large biases
+        // towards short sentences
+        for (int k = 0; k < max_k; k++) {
+          LOG.debug(" [{}, {}]", (k * binsize + 1), ((k + 1) * binsize));
+          this.subsample(set, bc, k * binsize + 1, (k + 1) * binsize, targetFtoERatio);
+
+          if (set.size() + totalSubsampled > maxSubsample) break;
+        }
+
+        float ff = 0.0f;
+        float ef = 0.0f;
+        for (PhrasePair pp : set.values()) {
+          // Accumulate F and E lengths for the ratio logged below
+          ff += pp.getF().size();
+          ef += pp.getE().size();
+
+          out.write(pp);
+          out.newLine();
+        }
+        out.flush();
+
+        totalSubsampled += set.size();
+        LOG.info("current={} [total={}] currentRatio={}", set.size(), totalSubsampled, (ff / ef));
+
+        // TODO: is this gc actually dubious? Or
+        // does profiling show it helps? We only
+        // do it once per file, so it's not a
+        // performance blackhole.
+        set = null;
+        bc = null;
+        System.gc();
+      }
+    } finally {
+      out.close();
+    }
+  }
+
+  /**
+   * The worker function for subsampling.
+   *
+   * @param set selected pairs so far, keyed by lowercased F text + '\n' + lowercased E text
+   * @param bc The sentence-aligned corpus to read from
+   * @param minLength The minimum F sentence length
+   * @param maxLength The maximum F sentence length
+   * @param targetFtoERatio The desired ratio of F length to E length (0 disables the check)
+   */
+  private void subsample(Map<String, PhrasePair> set, BiCorpus bc, int minLength,
+      int maxLength, float targetFtoERatio) {
+    for (PhrasePair pp : bc) {
+      {
+        int eLength = pp.getE().size();
+        if (eLength == 0 || eLength > MAX_SENTENCE_LENGTH) continue;
+      }
+
+      int fLength = pp.getF().size();
+      if (fLength == 0 || fLength < minLength || fLength > maxLength
+          || fLength > MAX_SENTENCE_LENGTH) continue;
+      if (fLength > 10 && targetFtoERatio != 0.0f) {
+        float ratio = pp.ratioFtoE();
+        if (fLength >= MIN_RATIO_LENGTH
+            && (ratio > 1.3f * targetFtoERatio || ratio * 1.3f < targetFtoERatio)) continue;
+      }
+
+      // PhrasePair has identity equals/hashCode, so a PhrasePair key could never match an earlier
+      // duplicate; key on the lowercased text ('\n' should not occur inside a line-based phrase).
+      String key = pp.getF().toString().toLowerCase() + "\n" + pp.getE().toString().toLowerCase();
+      if (set.containsKey(key)) continue;
+
+      // at this point, length checks out and the sentence hasn't
+      // been selected yet
+
+      List<Phrase> ngrams = pp.getF().getSubPhrases(this.maxN);
+      boolean useSentence = false;
+      for (Phrase ng : ngrams) {
+        Integer count = this.ngramCounts.get(ng);
+        if (count == null) continue;
+        if (count < targetCount) {
+          useSentence = true;
+          count++;
+          this.ngramCounts.put(ng, count);
+        }
+      }
+      if (useSentence) set.put(key, pp);
+    }
+  }
+
+  /** Command-line entry point; hands the arguments to a plain SubsamplerCLI. */
+  public static void main(String[] args) {
+    new SubsamplerCLI().runMain(args);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java b/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
new file mode 100644
index 0000000..5a287c3
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.IOException;
+
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+
+/**
+ * This class defines a callback closure to allow "overriding" the main function in subclasses of
+ * {@link Subsampler}, without duplicating code. For all subclasses, CLI <code>Options</code> should
+ * be members of the class (so they're visible to <code>runSubsampler</code> as well as
+ * <code>getCliOptions</code>), the <code>getCliOptions</code> method should be overridden to add
+ * the additional options (via <code>super</code> to keep the old options), and the
+ * <code>runSubsampler</code> method should be overridden to do the primary work for main. The
+ * <code>runMain</code> method ties everything together and should not need modification. Due to the
+ * one-use nature of subclasses of <code>SubsamplerCLI</code>, they generally should be implemented
+ * as anonymous local classes.
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+@SuppressWarnings("static-access")
+public class SubsamplerCLI {
+  // TODO: OptionBuilder's methods are static; chaining them through instances, as every
+  // definition below does, is what the static-access suppression above is for.
+  protected final Option ot = OptionBuilder.withArgName("listfile").hasArg()
+      .withDescription("A file containing a list of training file basenames (what to sample from)")
+      .isRequired().create("training");
+
+  protected final Option otest = OptionBuilder.withArgName("file").hasArgs()
+      .withDescription("The test file (what to sample for)").isRequired().create("test");
+
+  protected final Option ooutput = OptionBuilder.withArgName("basename").hasArgs()
+      .withDescription("File basename for output training corpus").isRequired().create("output");
+
+  protected final Option of = OptionBuilder.withArgName("lang").hasArg()
+      .withDescription("Foreign language extension").isRequired().create("f");
+
+  protected final Option oe = OptionBuilder.withArgName("lang").hasArg()
+      .withDescription("Native language extension").isRequired().create("e");
+
+  protected final Option ofpath = OptionBuilder.withArgName("path").hasArg()
+      .withDescription("Directory containing foreign language files").create("fpath");
+
+  protected final Option oepath = OptionBuilder.withArgName("path").hasArg()
+      .withDescription("Directory containing native language files").create("epath");
+
+  protected final Option oratio = OptionBuilder.withArgName("ratio").hasArg()
+      .withDescription("Target F/E ratio").create("ratio");
+
+  /** Result of the most recent successful parse in runMain; null until then. */
+  private org.apache.commons.cli.CommandLine parsedLine;
+
+  /**
+   * Return all Options. The HelpFormatter will print them in sorted order, so it doesn't matter
+   * when we add them. Subclasses should override this method by adding more options.
+   * @return all of the {@link org.apache.commons.cli.Options}
+   */
+  public Options getCliOptions() {
+    return new Options().addOption(ot).addOption(otest).addOption(of).addOption(oe)
+        .addOption(ofpath).addOption(oepath).addOption(oratio).addOption(ooutput);
+  }
+
+  /**
+   * This method should be overridden to return the class used in
+   * {@link org.apache.joshua.subsample.SubsamplerCLI#runSubsampler(String[], int, int, float)}.
+   * @return the {@link org.apache.joshua.subsample.Subsampler} implementation
+   */
+  public String getClassName() {
+    return Subsampler.class.getName();
+  }
+
+  /** Value parsed for {@code opt}; falls back to the Option's own value if nothing was parsed. */
+  protected String parsedValue(Option opt) {
+    return (null == parsedLine ? opt.getValue() : parsedLine.getOptionValue(opt.getOpt()));
+  }
+
+  /**
+   * Callback to run the subsampler. This function needs access to the variables holding each
+   * Option, thus all this closure nonsense.
+   * @param testFiles a String array of test files
+   * @param maxN forwarded to the {@link Subsampler} constructor as its maxN
+   * @param targetCount forwarded to the {@link Subsampler} constructor as its targetCount
+   * @param ratio forwarded to {@code subsample} as the target F/E ratio
+   * @throws IOException if there is an issue whilst reading input files
+   */
+  public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+      throws IOException {
+    new Subsampler(testFiles, maxN, targetCount).subsample(parsedValue(ot), ratio, parsedValue(of),
+        parsedValue(oe), parsedValue(ofpath), parsedValue(oepath), parsedValue(ooutput));
+  }
+
+  /**
+   * Non-static version of main so that we can define anonymous local classes to override or extend
+   * the above. Option values are read from the CommandLine returned by the parser.
+   * @param args a String array of input options
+   */
+  public void runMain(String[] args) {
+    Options o = this.getCliOptions();
+    try {
+      this.parsedLine = new GnuParser().parse(o, args);
+    } catch (ParseException pe) {
+      // The message from pe is ugly, so we omit it.
+      System.err.println("Error parsing command line");
+      new HelpFormatter().printHelp(this.getClassName(), o);
+      System.exit(1);
+    }
+
+    try {
+      String ratioArg = this.parsedValue(this.oratio);
+      float ratio = (ratioArg == null ? 0.8f : Float.parseFloat(ratioArg));
+      this.runSubsampler(this.parsedLine.getOptionValues(this.otest.getOpt()), 12, 20, ratio);
+    } catch (Exception e) {
+      e.printStackTrace();
+      System.exit(1);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java b/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java
new file mode 100644
index 0000000..b7fe744
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides executables Subsampler and AlignedSubsampler, 
+ * for subsampling from large training corpora based on a 
+ * test corpus.
+ */
+package org.apache.joshua.subsample;
+