You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:37 UTC
[26/60] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java b/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
new file mode 100644
index 0000000..d054515
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
@@ -0,0 +1,294 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.server;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.StringReader;
+import java.net.Socket;
+import java.net.SocketException;
+import java.net.URLDecoder;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.Translations;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.io.JSONMessage;
+import org.apache.joshua.decoder.io.TranslationRequestStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Handles one translation request, arriving either over a raw TCP/IP socket (see {@link #run()})
+ * or over HTTP (see {@link #handle(HttpExchange)}).
+ */
+public class ServerThread extends Thread implements HttpHandler {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ServerThread.class);
+  private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
+
+  private final JoshuaConfiguration joshuaConfiguration;
+  private final Socket socket;
+  private final Decoder decoder;
+
+  /**
+   * Creates a new ServerThread that can run a set of translations.
+   *
+   * @param socket the socket representing the input/output streams
+   * @param decoder the configured decoder that handles performing translations
+   * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   */
+  public ServerThread(Socket socket, Decoder decoder, JoshuaConfiguration joshuaConfiguration) {
+    this.joshuaConfiguration = joshuaConfiguration;
+    this.socket = socket;
+    this.decoder = decoder;
+  }
+
+  /**
+   * Reads the input from the socket, submits the input to the decoder, writes each resulting
+   * translation back to the socket encoded as UTF-8, then closes the reader and the socket.
+   */
+  @Override
+  public void run() {
+    // try-with-resources closes the reader and then the socket on every exit path, including
+    // a failure while opening the input stream.
+    try (Socket client = socket;
+        BufferedReader reader = new BufferedReader(
+            new InputStreamReader(client.getInputStream(), FILE_ENCODING))) {
+
+      TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+      try {
+        Translations translations = decoder.decodeAll(request);
+        OutputStream out = client.getOutputStream();
+
+        for (Translation translation : translations) {
+          // Encode explicitly: the input is read as UTF-8, so the output must not depend on the
+          // platform default charset.
+          out.write(translation.toString().getBytes(FILE_ENCODING));
+        }
+      } catch (SocketException e) {
+        LOG.error(" Socket interrupted", e);
+        request.shutdown();
+      }
+    } catch (IOException e) {
+      LOG.error(e.getMessage(), e);
+    }
+  }
+
+  /**
+   * Splits a query string of the form {@code k1=v1&k2=v2} into a map. No URL-decoding is
+   * performed here. Only the first {@code =} of each parameter separates key from value, so
+   * values may themselves contain {@code =}. A parameter without {@code =} maps to the empty
+   * string; empty parameters (as in {@code a=1&&b=2}) are skipped.
+   *
+   * @param query the query string; may be null or empty
+   * @return a map from parameter name to value (empty if there is no query)
+   */
+  public HashMap<String, String> queryToMap(String query) {
+    HashMap<String, String> result = new HashMap<String, String>();
+    if (query == null || query.isEmpty()) {
+      return result;
+    }
+    for (String param : query.split("&")) {
+      if (param.isEmpty()) {
+        continue;
+      }
+      // The limit of 2 keeps "=" inside the value and guarantees at least one element.
+      String[] pair = param.split("=", 2);
+      result.put(pair[0], pair.length > 1 ? pair[1] : "");
+    }
+    return result;
+  }
+
+  /**
+   * URL-decodes a single raw query component as UTF-8.
+   *
+   * @param raw the still-encoded value, or null
+   * @return the decoded value, or null if {@code raw} is null
+   * @throws IOException if the encoding is not supported
+   */
+  private static String decodeParam(String raw) throws IOException {
+    return raw == null ? null : URLDecoder.decode(raw, FILE_ENCODING.name());
+  }
+
+  /**
+   * Output stream that sends a complete HTTP 200 response through an {@link HttpExchange}.
+   * The whole body must be written with a single call to {@link #write(byte[])}, which sends
+   * the headers, writes the body and closes the response body stream.
+   */
+  private static final class HttpWriter extends OutputStream {
+
+    private final HttpExchange client;
+    private OutputStream out = null;
+
+    HttpWriter(HttpExchange client) {
+      this.client = client;
+      client.getResponseHeaders().add("Access-Control-Allow-Origin", "*");
+    }
+
+    @Override
+    public void write(byte[] response) throws IOException {
+      client.sendResponseHeaders(200, response.length);
+      out = client.getResponseBody();
+      out.write(response);
+      out.close();
+    }
+
+    @Override
+    public void write(int b) throws IOException {
+      if (out == null) {
+        // The response body only exists once write(byte[]) has sent the headers.
+        throw new IOException("HTTP response headers have not been sent; use write(byte[])");
+      }
+      out.write(b);
+    }
+  }
+
+  /**
+   * Called to handle an HTTP connection. Reads the {@code q} (text to translate) and
+   * {@code meta} (metadata command) parameters from the URL query string, decodes the text,
+   * processes the metadata command if present, and returns a JSON-formatted object to the caller.
+   *
+   * @param client the client connection
+   * @throws IOException if the response cannot be written
+   */
+  @Override
+  public synchronized void handle(HttpExchange client) throws IOException {
+
+    // Split the RAW query first and decode each value afterwards, so that encoded '&', '=',
+    // '+' and '%' inside a value cannot be mistaken for separators or decoded twice.
+    HashMap<String, String> params = queryToMap(client.getRequestURI().getRawQuery());
+    String query = decodeParam(params.get("q"));
+    String meta = decodeParam(params.get("meta"));
+
+    if (query == null) {
+      // Metadata-only request.
+      // NOTE(review): assumes decoding an empty request yields no translations -- confirm.
+      query = "";
+    }
+
+    JSONMessage message = new JSONMessage();
+    try (BufferedReader reader = new BufferedReader(new StringReader(query))) {
+      TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+      Translations translations = decoder.decodeAll(request);
+      if (meta != null && !meta.isEmpty()) {
+        handleMetadata(meta, message);
+      }
+
+      for (Translation translation : translations) {
+        LOG.info("TRANSLATION: '{}' with {} k-best items", translation,
+            translation.getStructuredTranslations().size());
+        message.addTranslation(translation);
+      }
+    }
+
+    String body = message.toString();
+    OutputStream out = new HttpWriter(client);
+    out.write(body.getBytes(FILE_ENCODING));
+    if (LOG.isDebugEnabled()) {
+      LOG.debug(body);
+    }
+    out.close();
+  }
+
+  /**
+   * Processes a metadata command received in the HTTP request. The first whitespace-separated
+   * token is the command name; the remainder is its argument string. Commands that produce
+   * output add it to {@code message}. Malformed commands are logged and ignored.
+   *
+   * @param meta the metadata request
+   * @param message the response message that receives any metadata or rules to send back
+   */
+  private void handleMetadata(String meta, JSONMessage message) {
+    String[] tokens = meta.trim().split("\\s+", 2);
+    String type = tokens[0];
+    String args = tokens.length > 1 ? tokens[1] : "";
+
+    if (type.equals("get_weight")) {
+      if (args.isEmpty()) {
+        LOG.error("* INVALID get_weight request '{}': missing feature name", meta);
+        return;
+      }
+      String weight = args;
+      // SLF4J only understands '{}' placeholders, so format the number separately.
+      LOG.info("WEIGHT: {} = {}", weight,
+          String.format("%.3f", Decoder.weights.getWeight(weight)));
+
+    } else if (type.equals("set_weights")) {
+      // Change one or more decoder weights, given as "feature value" pairs
+      String[] argTokens = args.trim().split("\\s+");
+      if (args.trim().isEmpty() || argTokens.length % 2 != 0) {
+        LOG.error("* INVALID set_weights request '{}': expected 'feature value' pairs", meta);
+        return;
+      }
+
+      for (int i = 0; i < argTokens.length; i += 2) {
+        String feature = argTokens[i];
+        float newValue;
+        try {
+          newValue = Float.parseFloat(argTokens[i + 1]);
+        } catch (NumberFormatException e) {
+          LOG.error("* set_weights: invalid value '{}' for feature '{}'", argTokens[i + 1],
+              feature);
+          continue;
+        }
+        float oldWeight = Decoder.weights.getWeight(feature);
+        Decoder.weights.set(feature, newValue);
+        LOG.info("set_weights: {} {} -> {}", feature, oldWeight,
+            Decoder.weights.getWeight(feature));
+      }
+
+      message.addMetaData("weights " + Decoder.weights.toString());
+
+    } else if (type.equals("get_weights")) {
+      message.addMetaData("weights " + Decoder.weights.toString());
+
+    } else if (type.equals("add_rule")) {
+      String[] argTokens = args.split(" \\|\\|\\| ");
+
+      if (argTokens.length < 3) {
+        LOG.error("* INVALID RULE '{}'", meta);
+        return;
+      }
+
+      String lhs = argTokens[0];
+      String source = argTokens[1];
+      String target = argTokens[2];
+      String featureStr = argTokens.length > 3 ? argTokens[3] : "";
+
+      /* Prepend source and target side nonterminals for phrase-based decoding. Probably better
+       * handled in each grammar type's addRule() function.
+       */
+      String ruleString = "stack".equals(joshuaConfiguration.search_algorithm)
+          ? String.format("%s ||| [X,1] %s ||| [X,1] %s ||| custom=1 %s", lhs, source, target,
+              featureStr)
+          : String.format("%s ||| %s ||| %s ||| custom=1 %s", lhs, source, target, featureStr);
+
+      Rule rule = new HieroFormatReader().parseLine(ruleString);
+      decoder.addCustomRule(rule);
+
+      LOG.info("Added custom rule {}", rule.toString());
+
+    } else if (type.equals("list_rules")) {
+
+      LOG.info("list_rules");
+
+      // Walk the grammar trie breadth-first. The list doubles as the work queue: nodes are
+      // appended while an index advances, which avoids repeatedly removing the first element.
+      ArrayList<Trie> nodes = new ArrayList<Trie>();
+      nodes.add(decoder.getCustomPhraseTable().getTrieRoot());
+
+      for (int i = 0; i < nodes.size(); i++) {
+        Trie trie = nodes.get(i);
+
+        if (trie == null) {
+          continue;
+        }
+
+        if (trie.hasRules()) {
+          for (Rule rule : trie.getRuleCollection().getRules()) {
+            message.addRule(rule.toString());
+            LOG.debug("Found rule: {}", rule);
+          }
+        }
+
+        if (trie.getExtensions() != null) {
+          nodes.addAll(trie.getExtensions());
+        }
+      }
+
+    } else if (type.equals("remove_rule")) {
+
+      Rule rule = new HieroFormatReader().parseLine(args);
+
+      LOG.info("remove_rule {}", rule);
+
+      // Follow the rule's source side down the trie; give up if the path does not exist.
+      Trie trie = decoder.getCustomPhraseTable().getTrieRoot();
+      int[] sourceTokens = rule.getFrench();
+      for (int i = 0; i < sourceTokens.length; i++) {
+        Trie nextTrie = trie.match(sourceTokens[i]);
+        if (nextTrie == null) {
+          return;
+        }
+        trie = nextTrie;
+      }
+
+      if (trie.hasRules()) {
+        for (Rule ruleCand : trie.getRuleCollection().getRules()) {
+          if (Arrays.equals(rule.getEnglish(), ruleCand.getEnglish())) {
+            // Removing during iteration is safe only because we stop iterating immediately.
+            trie.getRuleCollection().getRules().remove(ruleCand);
+            break;
+          }
+        }
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java b/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java
new file mode 100644
index 0000000..e054186
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/server/TcpServer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.server;
+
+import java.io.IOException;
+import java.net.ServerSocket;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * TCP/IP server. Accepts newline-separated input sentences written to the socket, translates them
+ * all, and writes the resulting translations back out to the socket.
+ */
+public class TcpServer {
+  private static final Logger LOG = LoggerFactory.getLogger(TcpServer.class);
+  private final JoshuaConfiguration joshuaConfiguration;
+  private final Decoder decoder;
+  private final int port;
+
+  /**
+   * @param decoder the configured decoder handed to every connection handler
+   * @param port the port to listen on
+   * @param joshuaConfiguration the configuration handed to every connection handler
+   */
+  public TcpServer(Decoder decoder, int port, JoshuaConfiguration joshuaConfiguration) {
+    this.joshuaConfiguration = joshuaConfiguration;
+    this.decoder = decoder;
+    this.port = port;
+  }
+
+  /**
+   * Listens on the configured port for new socket connections and starts one
+   * {@link ServerThread} per accepted connection, so multiple connections are handled
+   * concurrently. Does not return unless the server socket fails or is closed.
+   *
+   * @throws RuntimeException wrapping the underlying {@link IOException} if the port cannot be
+   *         bound or accepting a connection fails
+   */
+  public void start() {
+    // Bind to the same port that is logged and reported in errors (the constructor argument),
+    // and let try-with-resources release the server socket on every exit path.
+    try (ServerSocket serverSocket = new ServerSocket(port)) {
+      LOG.info("** TCP Server running and listening on port {}.", port);
+
+      while (!serverSocket.isClosed()) {
+        new ServerThread(serverSocket.accept(), decoder, joshuaConfiguration).start();
+      }
+    } catch (IOException e) {
+      // Keep the cause so the actual reason (port in use, permission denied, ...) is visible.
+      throw new RuntimeException(String.format("Could not listen on port: %d.", port), e);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java b/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
new file mode 100644
index 0000000..2915685
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
@@ -0,0 +1,103 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+
+
+/**
+ * Subsampler variant that carries word-alignment files along with the F and E files. Rather than
+ * duplicating the superclass logic, it passes callback objects that "override" the relevant
+ * superclass behaviour.
+ *
+ * @see org.apache.joshua.subsample.Subsampler
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class AlignedSubsampler extends Subsampler {
+
+  public AlignedSubsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+    super(testFiles, maxN, targetCount);
+  }
+
+
+  /**
+   * Subsamples an aligned corpus, writing the selected F, E and alignment lines to
+   * {@code output.extf}, {@code output.exte} and {@code output.exta} respectively.
+   *
+   * @param filelist list of source files to subsample from
+   * @param targetFtoERatio goal for ratio of output F length to output E length
+   * @param extf extension of F files
+   * @param exte extension of E files
+   * @param exta extension of alignment files
+   * @param fpath path to source F files
+   * @param epath path to source E files
+   * @param apath path to source alignment files
+   * @param output basename for output files (will append extensions)
+   * @throws IOException if there is an error reading the input file(s)
+   */
+  public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+      String exta, String fpath, String epath, String apath, String output) throws IOException {
+    // Open the three outputs in F, E, alignment order
+    final BufferedWriter foreignOut = openWriter(output + "." + extf);
+    final BufferedWriter nativeOut = openWriter(output + "." + exte);
+    final BufferedWriter alignmentOut = openWriter(output + "." + exta);
+    final PhraseWriter phraseWriter = new PhraseWriter(foreignOut, nativeOut, alignmentOut);
+
+    // Factory whose default corpus is the aligned one
+    final BiCorpusFactory alignedFactory =
+        new BiCorpusFactory(fpath, epath, apath, extf, exte, exta) { /* Local class definition */
+          public BiCorpus fromFiles(String f) throws IOException {
+            return this.alignedFromFiles(f);
+          }
+        };
+
+    this.subsample(filelist, targetFtoERatio, phraseWriter, alignedFactory);
+  }
+
+
+  /** Opens a buffered character writer on the given file using the "UTF8" charset. */
+  private static BufferedWriter openWriter(String fileName) throws IOException {
+    FileOutputStream bytes = new FileOutputStream(fileName);
+    OutputStreamWriter chars = new OutputStreamWriter(bytes, "UTF8");
+    return new BufferedWriter(chars);
+  }
+
+
+  @SuppressWarnings("static-access")
+  public static void main(String[] args) {
+    SubsamplerCLI cli = new SubsamplerCLI() { /* Local class definition */
+
+      // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+      protected final Option oa = OptionBuilder.withArgName("lang").hasArg()
+          .withDescription("Word alignment extension").isRequired().create("a");
+
+      // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+      protected final Option oapath = OptionBuilder.withArgName("path").hasArg()
+          .withDescription("Directory containing word alignment files").create("apath");
+
+      public Options getCliOptions() {
+        Options options = super.getCliOptions();
+        return options.addOption(oa).addOption(oapath);
+      }
+
+      public String getClassName() {
+        return AlignedSubsampler.class.getName();
+      }
+
+      public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+          throws IOException {
+        AlignedSubsampler sampler = new AlignedSubsampler(testFiles, maxN, targetCount);
+        sampler.subsample(ot.getValue(), ratio, of.getValue(), oe.getValue(), oa.getValue(),
+            ofpath.getValue(), oepath.getValue(), oapath.getValue(), ooutput.getValue());
+      }
+
+    };
+    cli.runMain(args);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java b/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java
new file mode 100644
index 0000000..073eb5c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/Alignment.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+/**
+ * A set of word alignments between an F phrase and an E phrase. The implementation uses a
+ * two-dimensional bit vector, though for our purposes we could just keep the original string around
+ * (which would save lots of time parsing and reconstructing the string).
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class Alignment {
+  private final short eLength;
+  private final short fLength;
+  private final M2 aligned;
+
+  /**
+   * Parses a whitespace-separated list of {@code f-e} index pairs, e.g. {@code "0-0 1-2"}.
+   * Leading and trailing whitespace is ignored; a null, empty or blank string yields an
+   * alignment with no links.
+   *
+   * @param fLength number of words in the F phrase
+   * @param eLength number of words in the E phrase
+   * @param alignments the alignment string, may be null
+   * @throws IllegalArgumentException if a token is not of the form {@code f-e} (this includes
+   *         {@link NumberFormatException} for non-numeric indices)
+   * @throws IndexOutOfBoundsException if an index is not within the phrase lengths
+   */
+  public Alignment(short fLength, short eLength, String alignments) {
+    this.eLength = eLength;
+    this.fLength = fLength;
+    this.aligned = new M2(fLength, eLength);
+
+    if (alignments == null) {
+      return;
+    }
+    // Trim first: splitting a string with leading whitespace would otherwise produce an empty
+    // first token and be rejected as malformed.
+    String trimmed = alignments.trim();
+    if (trimmed.isEmpty()) {
+      return;
+    }
+    for (String al : trimmed.split("\\s+")) { // TODO: joshua.util.Regex
+      String[] pair = al.split("-");
+      if (pair.length != 2) {
+        throw new IllegalArgumentException("Malformed alignment string: " + alignments);
+      }
+      short f = Short.parseShort(pair[0]);
+      short e = Short.parseShort(pair[1]);
+      if (f < 0 || e < 0 || f >= fLength || e >= eLength) {
+        throw new IndexOutOfBoundsException("out of bounds: " + f + "," + e);
+      }
+      aligned.set(f, e);
+    }
+  }
+
+
+  /**
+   * Renders the alignment as space-separated {@code f-e} pairs, ordered by F index and then by
+   * E index, with no trailing space. Returns the empty string if there are no links.
+   */
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    for (short i = 0; i < fLength; i++) {
+      for (short j = 0; j < eLength; j++) {
+        if (aligned.get(i, j)) {
+          sb.append(i).append('-').append(j).append(' ');
+        }
+      }
+    }
+
+    // Remove trailing space
+    if (sb.length() > 0) {
+      sb.setLength(sb.length() - 1);
+    }
+
+    return sb.toString();
+  }
+
+
+  /** A (short,short)->boolean map for storing alignments. */
+  private static final class M2 {
+    private final short width;
+    private final boolean[] bits;
+
+    public M2(short f, short e) {
+      width = f;
+      bits = new boolean[f * e];
+    }
+
+    public boolean get(short f, short e) {
+      // One row of 'width' cells per E index
+      return bits[width * e + f];
+    }
+
+    public void set(short f, short e) {
+      try {
+        bits[width * e + f] = true;
+      } catch (ArrayIndexOutOfBoundsException ee) {
+        throw new RuntimeException("Set(" + f + ", " + e + "): caught " + ee, ee);
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java
new file mode 100644
index 0000000..06ec0e9
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpus.java
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.joshua.corpus.Phrase;
+
+/**
+ * Class for representing a sentence-aligned bi-corpus (with optional word-alignments).
+ * <p>
+ * In order to avoid memory crashes we no longer extend an ArrayList, which tries to cache the
+ * entire file in memory at once. This means we'll re-read through each file (1 +
+ * {@link Subsampler#MAX_SENTENCE_LENGTH} / binsize) times where binsize is determined by the
+ * <code>subsample(String, float, PhraseWriter, BiCorpusFactory)</code> method.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class BiCorpus implements Iterable<PhrasePair> {
+  protected final String foreignFileName;
+  protected final String nativeFileName;
+  protected final String alignmentFileName;
+
+  /**
+   * Constructor for unaligned BiCorpus.
+   * @param foreignFileName path of the foreign (F) side file
+   * @param nativeFileName path of the native (E) side file
+   * @throws IOException declared for callers; read failures during validation currently surface
+   *         as {@link RuntimeException}
+   */
+  public BiCorpus(String foreignFileName, String nativeFileName) throws IOException {
+    this(foreignFileName, nativeFileName, null);
+  }
+
+  /**
+   * Constructor for word-aligned BiCorpus. Reads through all files once to validate them.
+   * @param foreignFileName path of the foreign (F) side file
+   * @param nativeFileName path of the native (E) side file
+   * @param alignmentFileName path of the word-alignment file, or null for an unaligned corpus
+   * @throws IOException declared for callers; read failures during validation currently surface
+   *         as {@link RuntimeException}
+   * @throws IllegalArgumentException if an alignment line is malformed
+   * @throws IndexOutOfBoundsException if an alignment refers to a word outside its phrase
+   */
+  public BiCorpus(String foreignFileName, String nativeFileName, String alignmentFileName)
+      throws IOException, IllegalArgumentException, IndexOutOfBoundsException {
+    this.foreignFileName = foreignFileName;
+    this.nativeFileName = nativeFileName;
+    this.alignmentFileName = alignmentFileName;
+
+    // Check for fileLengthMismatchException
+    // Of course, that will be checked for in each iteration
+    //
+    // We write it this way to avoid warnings from the foreach style loop
+    Iterator<PhrasePair> it = iterator();
+    while (it.hasNext()) {
+      it.next();
+    }
+  }
+
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+  // File handles are closed as soon as an iterator is exhausted or fails. An iterator that is
+  // abandoned part-way through still leaves its files open; migrating to
+  // joshua.util.io.LineReader with an explicit close would address that.
+  // NOTE(review): FileReader decodes with the platform default charset -- confirm this matches
+  // the encoding the corpus files are written in.
+
+  // We're not allowed to throw checked exceptions from Iterator/Iterable,
+  // so I/O failures are rethrown as RuntimeException.
+  /**
+   * Iterate through the files represented by this <code>BiCorpus</code>, returning a
+   * {@link PhrasePair} for each pair (or triple) of lines. Pairs in which either phrase is empty
+   * are skipped; no alignment line is consumed for a skipped pair.
+   *
+   * @throws RuntimeException if a file cannot be opened; the returned iterator throws
+   *         RuntimeException on read errors or when the native or alignment file runs out
+   *         before the foreign file does
+   */
+  @Override
+  @SuppressWarnings("resource")
+  public Iterator<PhrasePair> iterator() {
+    FileReader foreignFile = null;
+    FileReader nativeFile = null;
+    FileReader alignFile = null;
+    try {
+      foreignFile = new FileReader(this.foreignFileName);
+      nativeFile = new FileReader(this.nativeFileName);
+      if (null != this.alignmentFileName) {
+        alignFile = new FileReader(this.alignmentFileName);
+      }
+    } catch (FileNotFoundException e) {
+      // Do not leak the files that were opened before the failing one
+      closeQuietly(foreignFile);
+      closeQuietly(nativeFile);
+      closeQuietly(alignFile);
+      throw new RuntimeException("File not found", e);
+    }
+
+    // Underlying handles, kept so they can be closed without relying on the wrapping readers
+    final FileReader[] handles = {foreignFile, nativeFile, alignFile};
+    final PhraseReader rf = new PhraseReader(foreignFile, (byte) 1);
+    final PhraseReader re = new PhraseReader(nativeFile, (byte) 0);
+    final BufferedReader ra = (null == alignFile ? null : new BufferedReader(alignFile));
+
+    return new Iterator<PhrasePair>() { /* Local class definition */
+      /** The pair to be returned by the next call to next(), or null if none is buffered. */
+      private PhrasePair pending = null;
+      /** Set once the input is exhausted or has failed; the files are closed by then. */
+      private boolean finished = false;
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+
+      public boolean hasNext() {
+        // Look ahead to the next NON-EMPTY pair, so that hasNext() never promises an element
+        // when only empty pairs remain.
+        if (null == this.pending && !this.finished) {
+          this.pending = fetch();
+        }
+        return null != this.pending;
+      }
+
+      public PhrasePair next() {
+        if (!this.hasNext()) {
+          throw new NoSuchElementException();
+        }
+        PhrasePair result = this.pending;
+        this.pending = null;
+        return result;
+      }
+
+      /** Reads the next pair; closes all files at end of input or on any failure. */
+      private PhrasePair fetch() {
+        PhrasePair pair = null;
+        try {
+          pair = readNonEmptyPair();
+          return pair;
+        } catch (IOException ioe) {
+          throw new RuntimeException("IOException", ioe);
+        } finally {
+          if (null == pair) {
+            this.finished = true;
+            for (FileReader handle : handles) {
+              closeQuietly(handle);
+            }
+          }
+        }
+      }
+
+      /** Returns the next pair whose phrases are both non-empty, or null at end of input. */
+      private PhrasePair readNonEmptyPair() throws IOException {
+        // A loop rather than recursion, so long runs of empty lines cannot exhaust the stack
+        while (true) {
+          Phrase f = rf.readPhrase();
+          if (null == f) {
+            return null;
+          }
+          Phrase e = re.readPhrase();
+          if (null == e) {
+            throw fileLengthMismatch();
+          }
+          if (0 == e.size() || 0 == f.size()) {
+            continue;
+          }
+          if (null == ra) {
+            return new PhrasePair(f, e);
+          }
+          String line = ra.readLine();
+          if (null == line) {
+            throw fileLengthMismatch();
+          }
+          Alignment a = new Alignment((short) f.size(), (short) e.size(), line);
+          return new PhrasePair(f, e, a);
+        }
+      }
+    }; /* End local class definition */
+  } /* end iterator() */
+
+
+  /** Builds the exception reported when one file has fewer lines than the foreign file. */
+  private static RuntimeException fileLengthMismatch() {
+    return new RuntimeException("Mismatched file lengths!");
+  }
+
+  /** Closes a file handle, ignoring null and any failure to close. */
+  private static void closeQuietly(FileReader reader) {
+    if (null == reader) {
+      return;
+    }
+    try {
+      reader.close();
+    } catch (IOException ignored) {
+      // Best effort: nothing useful can be done if closing a read-only handle fails
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
new file mode 100644
index 0000000..eda3bf5
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.File;
+import java.io.IOException;
+
+/**
+ * A callback closure for <code>Subsampler.subsample</code>. This class is used by
+ * {@link AlignedSubsampler} in order to "override" methods of {@link Subsampler}, minimizing code
+ * duplication.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class BiCorpusFactory {
+  // Directory prefixes (each ending in the file separator) and extensions (each starting with
+  // a dot), precomputed once so that fromFiles only has to concatenate.
+  protected final String fpath;
+  protected final String epath;
+  protected final String apath;
+  protected final String extf;
+  protected final String exte;
+  protected final String exta;
+
+  /**
+   * @param fpath directory of the F files, or null for the current directory
+   * @param epath directory of the E files, or null for the current directory
+   * @param apath directory of the alignment files, or null for the current directory
+   * @param extf extension of the F files, without the leading dot
+   * @param exte extension of the E files, without the leading dot
+   * @param exta extension of the alignment files, without the leading dot; null if there are
+   *        no alignment files
+   */
+  public BiCorpusFactory(String fpath, String epath, String apath, String extf, String exte,
+      String exta) {
+    // The various concatenation has been moved up here
+    // to get it out of the loops where fromFiles is called.
+    this.fpath = (fpath == null ? "." : fpath) + File.separator;
+    this.epath = (epath == null ? "." : epath) + File.separator;
+    this.apath = (apath == null ? "." : apath) + File.separator;
+    this.extf = "." + extf;
+    this.exte = "." + exte;
+    this.exta = (exta == null ? null : "." + exta);
+  }
+
+
+  /**
+   * Generate unaligned {@link org.apache.joshua.subsample.BiCorpus} by default.
+   * @param f base name of the corpus files, without directory or extension
+   * @return an unaligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if there is an error reading input file
+   */
+  public BiCorpus fromFiles(String f) throws IOException {
+    return this.unalignedFromFiles(f);
+  }
+
+  /**
+   * Generate unaligned BiCorpus.
+   * @param f base name of the corpus files, without directory or extension
+   * @return an unaligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if there is an error reading input file
+   */
+  public BiCorpus unalignedFromFiles(String f) throws IOException {
+    return new BiCorpus(fpath + f + extf, epath + f + exte);
+  }
+
+  /**
+   * Generate aligned BiCorpus.
+   * @param f base name of the corpus files, without directory or extension
+   * @return an aligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if there is an error reading input file
+   * @throws IllegalStateException if this factory was created without an alignment extension
+   */
+  public BiCorpus alignedFromFiles(String f) throws IOException {
+    if (exta == null) {
+      // Concatenating a null extension would silently look for a file named "<f>null"
+      throw new IllegalStateException(
+          "No alignment file extension configured; cannot build an aligned BiCorpus for " + f);
+    }
+    return new BiCorpus(fpath + f + extf, epath + f + exte, apath + f + exta);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java b/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java
new file mode 100644
index 0000000..41c05d3
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/PhrasePair.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import org.apache.joshua.corpus.Phrase;
+
+
+/**
+ * Phrase-aligned tuple class associating an F phrase, E phrase, and (possibly null)
+ * word-alignments. This is primarily for maintaining sentence-alignment.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class PhrasePair {
+ // Making these final requires Java6, not Java5
+ private final Phrase f;
+ private final Phrase e;
+ private final Alignment a;
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+ public PhrasePair(Phrase f_, Phrase e_) {
+ this(f_, e_, null);
+ }
+
+ public PhrasePair(Phrase f, Phrase e, Alignment a) {
+ this.f = f;
+ this.e = e;
+ this.a = a;
+ }
+
+ // ===============================================================
+ // Attributes
+ // ===============================================================
+ public Phrase getF() {
+ return f;
+ }
+
+ public Phrase getE() {
+ return e;
+ }
+
+ public Alignment getAlignment() {
+ return a;
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+ public float ratioFtoE() {
+ return ((float) this.f.size()) / ((float) this.e.size());
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java
new file mode 100644
index 0000000..6db216f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseReader.java
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.joshua.corpus.BasicPhrase;
+
+/**
+ * A {@link BufferedReader} that can additionally hand back each line of its input as a
+ * {@link BasicPhrase}.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class PhraseReader extends BufferedReader {
+  /** Language tag handed to every {@link BasicPhrase} this reader creates. */
+  private byte language;
+
+  /**
+   * Wraps a character source.
+   *
+   * @param r the reader to buffer and read lines from
+   * @param language the language tag passed along to each phrase that is built
+   */
+  public PhraseReader(Reader r, byte language) {
+    super(r);
+    this.language = language;
+  }
+
+  /**
+   * Reads the next line and converts it into a phrase.
+   *
+   * @return a {@link BasicPhrase} built from the next line, or {@code null} once the underlying
+   *         reader has no more lines
+   * @throws IOException if reading the line fails
+   */
+  public BasicPhrase readPhrase() throws IOException {
+    final String text = super.readLine();
+    if (text == null) {
+      return null;
+    }
+    return new BasicPhrase(this.language, text);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
new file mode 100644
index 0000000..11bbf08
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+
+
+/**
+ * A PhrasePair-parallel BufferedWriter: it fans each operation out to one writer for the F side,
+ * one for the E side and, optionally, one for the alignments. In an ideal world we could get the
+ * compiler to inline all of this, to have zero-overhead while not duplicating code. Alas, Java's
+ * not that cool. The "final" could help on JIT at least.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public final class PhraseWriter {
+  // Making these final requires Java6, not Java5
+  private final BufferedWriter wf;
+  private final BufferedWriter we;
+  /** Alignment writer; {@code null} when alignments are not being written. */
+  private final BufferedWriter wa;
+
+  // ===============================================================
+  // Constructors
+  // ===============================================================
+  /**
+   * Creates a writer that emits only the F and E sides.
+   *
+   * @param wf_ destination for F phrases
+   * @param we_ destination for E phrases
+   */
+  public PhraseWriter(BufferedWriter wf_, BufferedWriter we_) {
+    this(wf_, we_, null);
+  }
+
+  /**
+   * Creates a writer that emits the F and E sides and, if requested, the alignments.
+   *
+   * @param wf destination for F phrases
+   * @param we destination for E phrases
+   * @param wa destination for alignments, or {@code null} to skip them
+   */
+  public PhraseWriter(BufferedWriter wf, BufferedWriter we, BufferedWriter wa) {
+    this.wf = wf;
+    this.we = we;
+    this.wa = wa;
+  }
+
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+  /**
+   * Writes the string form of each component of the pair to its writer, without a line
+   * terminator. When an alignment writer is present the pair's alignment is dereferenced, so a
+   * pair with a {@code null} alignment causes a {@link NullPointerException} in that case.
+   *
+   * @param pp the pair to write
+   * @throws IOException if any underlying writer fails
+   */
+  public void write(PhrasePair pp) throws IOException {
+    this.wf.write(pp.getF().toString());
+    this.we.write(pp.getE().toString());
+    if (null != this.wa) {
+      this.wa.write(pp.getAlignment().toString());
+    }
+  }
+
+  /**
+   * Writes a line separator to every underlying writer.
+   *
+   * @throws IOException if any underlying writer fails
+   */
+  public void newLine() throws IOException {
+    this.wf.newLine();
+    this.we.newLine();
+    if (null != this.wa) {
+      this.wa.newLine();
+    }
+  }
+
+  /**
+   * Flushes every underlying writer.
+   *
+   * @throws IOException if any underlying writer fails
+   */
+  public void flush() throws IOException {
+    this.wf.flush();
+    this.we.flush();
+    if (null != this.wa) {
+      this.wa.flush();
+    }
+  }
+
+  /**
+   * Closes every underlying writer. Each writer is closed even if closing an earlier one throws,
+   * so a failure on one side cannot leak the file handles of the others; when more than one
+   * close fails, the exception from the last failing writer is the one that propagates.
+   *
+   * @throws IOException if closing any underlying writer fails
+   */
+  public void close() throws IOException {
+    try {
+      this.wf.close();
+    } finally {
+      try {
+        this.we.close();
+      } finally {
+        if (null != this.wa) {
+          this.wa.close();
+        }
+      }
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java b/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java
new file mode 100644
index 0000000..36e1925
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/Subsampler.java
@@ -0,0 +1,246 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.joshua.corpus.BasicPhrase;
+import org.apache.joshua.corpus.Phrase;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A class for subsampling a large (F,E)-parallel sentence-aligned corpus to generate a smaller
+ * corpus whose N-grams are relevant to some seed corpus. The idea of subsampling owes to Kishore
+ * Papineni.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+public class Subsampler {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Subsampler.class);
+
+  /**
+   * Every n-gram found in the test files, mapped to a running count that starts at 0 and is
+   * incremented while sentences are being selected.
+   */
+  protected Map<Phrase, Integer> ngramCounts;
+  /** Passed to {@code getSubPhrases}; presumably the maximum n-gram length - TODO confirm. */
+  protected int maxN;
+  /** An n-gram only makes a sentence worth selecting while its count is below this value. */
+  protected int targetCount;
+  /** Once the number of selected pairs exceeds this, no further length bins are tried. */
+  protected int maxSubsample = 1500000;
+
+  protected static final int MAX_SENTENCE_LENGTH = 100;
+  protected static final int MIN_RATIO_LENGTH = 10;
+
+  /** Width of the F-length bins that each training file is scanned in. */
+  private static final int LENGTH_BIN_SIZE = 10;
+
+
+  /**
+   * Loads the n-grams of the test files that later drive sentence selection.
+   *
+   * @param testFiles paths of the test files
+   * @param maxN value handed to {@code getSubPhrases} when extracting n-grams
+   * @param targetCount count below which an n-gram still makes a sentence worth selecting
+   * @throws IOException if a test file cannot be read
+   */
+  public Subsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+    this.maxN = maxN;
+    this.targetCount = targetCount;
+    this.ngramCounts = loadNgrams(testFiles);
+  }
+
+  /**
+   * Reads every line of every file as a phrase and registers all of its sub-phrases with a count
+   * of zero.
+   *
+   * @param files paths of the files to read
+   * @return map from each n-gram seen to the count 0
+   * @throws IOException if a file cannot be opened or read
+   */
+  private HashMap<Phrase, Integer> loadNgrams(String[] files) throws IOException {
+    HashMap<Phrase, Integer> map = new HashMap<Phrase, Integer>();
+    for (String fn : files) {
+      LOG.debug("Loading test set from {}", fn);
+
+      PhraseReader reader = new PhraseReader(new FileReader(fn), (byte) 1);
+      int lineCount = 0;
+      try {
+        Phrase phrase;
+        while ((phrase = reader.readPhrase()) != null) {
+          lineCount++;
+          for (Phrase ngram : phrase.getSubPhrases(this.maxN)) {
+            map.put(ngram, 0);
+          }
+        }
+      } finally {
+        reader.close();
+      }
+      LOG.debug("Processed {} lines in {}", lineCount, fn);
+    }
+    LOG.debug("Test set: {} ngrams", map.size());
+    return map;
+  }
+
+
+  /**
+   * The general subsampler function for external use. Opens {@code output.extf} and
+   * {@code output.exte} as UTF-8 output files and delegates to the worker wrapper.
+   *
+   * @param filelist path of a file listing, one per line, the basenames of the source files to
+   *        subsample from
+   * @param targetFtoERatio goal for ratio of output F length to output E length
+   * @param extf extension of F files
+   * @param exte extension of E files
+   * @param fpath path to source F files
+   * @param epath path to source E files
+   * @param output basename for output files (will append extensions)
+   * @throws IOException if there is an issue reading one of the input files
+   */
+  public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+      String fpath, String epath, String output) throws IOException {
+    this.subsample(filelist, targetFtoERatio, new PhraseWriter(new BufferedWriter(
+        new OutputStreamWriter(new FileOutputStream(output + "." + extf), "UTF8")),
+        new BufferedWriter(
+            new OutputStreamWriter(new FileOutputStream(output + "." + exte), "UTF8"))),
+        new BiCorpusFactory(fpath, epath, null, extf, exte, null));
+  }
+
+  /**
+   * The main wrapper for the subsample worker. Closes the
+   * {@link org.apache.joshua.subsample.PhraseWriter} before exiting.
+   *
+   * @param filelist path of a file listing, one per line, the basenames of the source files to
+   *        subsample from
+   * @param targetFtoERatio goal for ratio of output F length to output E length
+   * @param out a {@link org.apache.joshua.subsample.PhraseWriter} to flush data to
+   * @param bcFactory used to generate a sentence-aligned {@link org.apache.joshua.subsample.BiCorpus}
+   * @throws IOException if there is an issue reading one of the input files
+   */
+  protected void subsample(String filelist, float targetFtoERatio, PhraseWriter out,
+      BiCorpusFactory bcFactory) throws IOException {
+    try {
+      List<String> files = readFileList(filelist);
+
+      int totalSubsampled = 0;
+      // Iterating on files in order biases towards files
+      // earlier in the list
+      for (String f : files) {
+        LOG.info("Loading training data: {}", f);
+
+        BiCorpus bc = bcFactory.fromFiles(f);
+
+        // Maps the lowercased form of each selected pair to the pair as it appeared in the corpus.
+        HashMap<PhrasePair, PhrasePair> set = new HashMap<PhrasePair, PhrasePair>();
+
+        int binCount = MAX_SENTENCE_LENGTH / LENGTH_BIN_SIZE;
+        LOG.debug("Looking in length range");
+        // Iterating bins from small to large biases
+        // towards short sentences
+        for (int k = 0; k < binCount; k++) {
+          int binMin = k * LENGTH_BIN_SIZE + 1;
+          int binMax = (k + 1) * LENGTH_BIN_SIZE;
+          LOG.debug(" [{}, {}]", binMin, binMax);
+          this.subsample(set, bc, binMin, binMax, targetFtoERatio);
+
+          // NOTE(review): this only leaves the bin loop. Later files are still loaded and
+          // their first bin is still sampled, so maxSubsample is not a hard limit - confirm
+          // whether that is intended.
+          if (set.size() + totalSubsampled > maxSubsample) {
+            break;
+          }
+        }
+
+        // Token totals are taken from the lowercased keys; the original-case values are written.
+        float ff = 0.0f;
+        float ef = 0.0f;
+        for (Map.Entry<PhrasePair, PhrasePair> selected : set.entrySet()) {
+          ff += selected.getKey().getF().size();
+          ef += selected.getKey().getE().size();
+
+          out.write(selected.getValue());
+          out.newLine();
+        }
+        out.flush();
+
+        totalSubsampled += set.size();
+        LOG.info("current={} [total={}] currentRatio={}", set.size(), totalSubsampled, (ff / ef));
+
+        // TODO: is this gc actually dubious? Or
+        // does profiling show it helps? We only
+        // do it once per file, so it's not a
+        // performance blackhole.
+        set = null;
+        bc = null;
+        System.gc();
+      }
+    } finally {
+      out.close();
+    }
+  }
+
+  /**
+   * Reads a text file into a list with one entry per line.
+   *
+   * @param filelist path of the file to read
+   * @return the lines of the file, in order
+   * @throws IOException if the file cannot be opened or read
+   */
+  private List<String> readFileList(String filelist) throws IOException {
+    List<String> files = new ArrayList<String>();
+    // Closing the BufferedReader also closes the FileReader it wraps.
+    BufferedReader br = new BufferedReader(new FileReader(filelist));
+    try {
+      String file;
+      while ((file = br.readLine()) != null) {
+        files.add(file);
+      }
+    } finally {
+      br.close();
+    }
+    return files;
+  }
+
+  /**
+   * The worker function for subsampling. A sentence pair is selected when it passes the length
+   * and ratio filters, has not been selected before, and contains at least one test n-gram whose
+   * count is still below {@code targetCount}; the counts of such n-grams are incremented.
+   *
+   * @param set The set to put selected sentences into
+   * @param bc The sentence-aligned corpus to read from
+   * @param minLength The minimum F sentence length
+   * @param maxLength The maximum F sentence length
+   * @param targetFtoERatio The desired ratio of F length to E length; 0 disables the ratio filter
+   */
+  private void subsample(HashMap<PhrasePair, PhrasePair> set, BiCorpus bc, int minLength,
+      int maxLength, float targetFtoERatio) {
+    for (PhrasePair pp : bc) {
+      int eLength = pp.getE().size();
+      if (eLength == 0 || eLength > MAX_SENTENCE_LENGTH) {
+        continue;
+      }
+
+      int fLength = pp.getF().size();
+      if (fLength == 0 || fLength < minLength || fLength > maxLength
+          || fLength > MAX_SENTENCE_LENGTH) {
+        continue;
+      }
+
+      // The ratio filter applies only to F sentences longer than MIN_RATIO_LENGTH tokens; it
+      // rejects pairs whose F/E ratio is more than a factor of 1.3 away from the target.
+      if (fLength > MIN_RATIO_LENGTH && targetFtoERatio != 0.0f) {
+        float ratio = pp.ratioFtoE();
+        if (ratio > 1.3f * targetFtoERatio || ratio * 1.3f < targetFtoERatio) {
+          continue;
+        }
+      }
+
+      // The de-duplication key is built only for sentences that survived the cheap filters
+      // above. Locale.ROOT keeps the lowercasing independent of the JVM's default locale.
+      PhrasePair lowercase_pp = new PhrasePair(
+          new BasicPhrase((byte) 1, pp.getF().toString().toLowerCase(Locale.ROOT)),
+          new BasicPhrase((byte) 1, pp.getE().toString().toLowerCase(Locale.ROOT)),
+          pp.getAlignment());
+      if (set.containsKey(lowercase_pp)) {
+        continue;
+      }
+
+      // at this point, length checks out and the sentence hasn't
+      // been selected yet
+
+      boolean useSentence = false;
+      for (Phrase ng : pp.getF().getSubPhrases(this.maxN)) {
+        Integer count = this.ngramCounts.get(ng);
+        if (count == null) {
+          continue; // not an n-gram of the test set
+        }
+        if (count < targetCount) {
+          useSentence = true;
+          this.ngramCounts.put(ng, count + 1);
+        }
+      }
+      if (useSentence) {
+        set.put(lowercase_pp, pp);
+      }
+    }
+  }
+
+
+  /**
+   * Command-line entry point; hands the arguments to a {@link SubsamplerCLI}.
+   *
+   * @param args command-line arguments
+   */
+  public static void main(String[] args) {
+    new SubsamplerCLI().runMain(args);
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java b/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
new file mode 100644
index 0000000..5a287c3
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.subsample;
+
+import java.io.IOException;
+
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+
+/**
+ * This class defines a callback closure to allow "overriding" the main function in subclasses of
+ * {@link Subsampler}, without duplicating code. For all subclasses, CLI <code>Options</code> should
+ * be members of the class (so they're visible to <code>runSubsampler</code> as well as
+ * <code>getCliOptions</code>), the <code>getCliOptions</code> method should be overridden to add
+ * the additional options (via <code>super</code> to keep the old options), and the
+ * <code>runSubsampler</code> method should be overridden to do the primary work for main. The
+ * <code>runMain</code> method ties everything together and should not need modification. Due to the
+ * one-use nature of subclasses of <code>SubsampleCLI</code>, they generally should be implemented
+ * as anonymous local classes.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate$
+ */
+@SuppressWarnings("static-access")
+public class SubsamplerCLI {
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option ot = OptionBuilder.withArgName("listfile").hasArg()
+ .withDescription("A file containing a list of training file basenames (what to sample from)")
+ .isRequired().create("training");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option otest = OptionBuilder.withArgName("file").hasArgs()
+ .withDescription("The test file (what to sample for)").isRequired().create("test");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option ooutput = OptionBuilder.withArgName("basename").hasArgs()
+ .withDescription("File basename for output training corpus").isRequired().create("output");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option of = OptionBuilder.withArgName("lang").hasArg()
+ .withDescription("Foreign language extension").isRequired().create("f");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oe = OptionBuilder.withArgName("lang").hasArg()
+ .withDescription("Native language extension").isRequired().create("e");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option ofpath = OptionBuilder.withArgName("path").hasArg()
+ .withDescription("Directory containing foreign language files").create("fpath");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oepath = OptionBuilder.withArgName("path").hasArg()
+ .withDescription("Directory containing native language files").create("epath");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oratio = OptionBuilder.withArgName("ratio").hasArg()
+ .withDescription("Target F/E ratio").create("ratio");
+
+ /**
+ * Return all Options. The HelpFormatter will print them in sorted order, so it doesn't matter
+ * when we add them. Subclasses should override this method by adding more options.
+ * @return all of the {@link org.apache.commons.cli.Options}
+ */
+ public Options getCliOptions() {
+ return new Options().addOption(ot).addOption(otest).addOption(of).addOption(oe)
+ .addOption(ofpath).addOption(oepath).addOption(oratio).addOption(ooutput);
+ }
+
+ /**
+ * This method should be overridden to return the class used in
+ * {@link org.apache.joshua.subsample.SubsamplerCLI#runSubsampler(String[], int, int, float)}.
+ * @return the {@link org.apache.joshua.subsample.Subsampler} implementation
+ */
+ public String getClassName() {
+ return Subsampler.class.getName();
+ }
+
+ /**
+ * Callback to run the subsampler. This function needs access to the variables holding each
+ * Option, thus all this closure nonsense.
+ * @param testFiles a String array of test files
+ * @param maxN todo
+ * @param targetCount todo
+ * @param ratio todo
+ * @throws IOException if there is an issue whilst reading input files
+ */
+ public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+ throws IOException {
+ new Subsampler(testFiles, maxN, targetCount).subsample(ot.getValue(), ratio, of.getValue(),
+ oe.getValue(), ofpath.getValue(), oepath.getValue(), ooutput.getValue());
+ }
+
+ /**
+ * Non-static version of main so that we can define anonymous local classes to override or extend
+ * the above.
+ * @param args a String array of input options
+ */
+ public void runMain(String[] args) {
+ Options o = this.getCliOptions();
+ try {
+ new GnuParser().parse(o, args);
+ } catch (ParseException pe) {
+ // The message from pe is ugly, so we omit it.
+ System.err.println("Error parsing command line");
+ new HelpFormatter().printHelp(this.getClassName(), o);
+ System.exit(1);
+ }
+
+ try {
+ float ratio = 0.8f;
+ if (this.oratio.getValue() != null) {
+ ratio = Float.parseFloat(this.oratio.getValue());
+ }
+ this.runSubsampler(this.otest.getValues(), 12, 20, ratio);
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java b/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java
new file mode 100644
index 0000000..b7fe744
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/subsample/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides executables Subsampler and AlignedSubsampler,
+ * for subsampling from large training corpora based on a
+ * test corpus.
+ */
+package org.apache.joshua.subsample;
+