You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:19 UTC
[03/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it
possible to use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/server/ServerThread.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/server/ServerThread.java b/src/main/java/org/apache/joshua/server/ServerThread.java
new file mode 100644
index 0000000..ac0390b
--- /dev/null
+++ b/src/main/java/org/apache/joshua/server/ServerThread.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.server;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.StringReader;
+import java.net.Socket;
+import java.net.SocketException;
+import java.net.URLDecoder;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.io.TranslationRequestStream;
+
+/**
+ * This class handles a concurrent request for translations from a newly opened socket.
+ */
+public class ServerThread extends Thread implements HttpHandler {
+ private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
+
+ private final JoshuaConfiguration joshuaConfiguration;
+ private Socket socket = null;
+ private final Decoder decoder;
+
+ /**
+ * Creates a new TcpServerThread that can run a set of translations.
+ *
+ * @param socket the socket representing the input/output streams
+ * @param decoder the configured decoder that handles performing translations
+ */
+ public ServerThread(Socket socket, Decoder decoder, JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ this.socket = socket;
+ this.decoder = decoder;
+ }
+
+ /**
+ * Reads the input from the socket, submits the input to the decoder, transforms the resulting
+ * translations into the required output format, writes out the formatted output, then closes the
+ * socket.
+ */
+ @Override
+ public void run() {
+
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), FILE_ENCODING));
+
+ TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+ try {
+ decoder.decodeAll(request, socket.getOutputStream());
+
+ } catch (SocketException e) {
+ System.err.println("* WARNING: Socket interrupted");
+ request.shutdown();
+ return;
+ }
+ reader.close();
+ socket.close();
+ } catch (IOException e) {
+ return;
+ }
+ }
+
+ public HashMap<String, String> queryToMap(String query){
+ HashMap<String, String> result = new HashMap<String, String>();
+ for (String param : query.split("&")) {
+ String pair[] = param.split("=");
+ if (pair.length > 1) {
+ result.put(pair[0], pair[1]);
+ } else {
+ result.put(pair[0], "");
+ }
+ }
+ return result;
+ }
+
+ private class HttpWriter extends OutputStream {
+
+ private HttpExchange client = null;
+ private OutputStream out = null;
+
+ public HttpWriter(HttpExchange client) {
+ this.client = client;
+ }
+
+ @Override
+ public void write(byte[] response) throws IOException {
+ client.sendResponseHeaders(200, response.length);
+ out = client.getResponseBody();
+ out.write(response);
+ out.close();
+ }
+
+ @Override
+ public void write(int b) throws IOException {
+ out.write(b);
+ }
+ }
+
+
+ @Override
+ public void handle(HttpExchange client) throws IOException {
+
+ HashMap<String, String> params = queryToMap(URLDecoder.decode(client.getRequestURI().getQuery(), "UTF-8"));
+ String query = params.get("q");
+
+ BufferedReader reader = new BufferedReader(new StringReader(query));
+ TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+ decoder.decodeAll(request, new HttpWriter(client));
+ reader.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/server/TcpServer.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/server/TcpServer.java b/src/main/java/org/apache/joshua/server/TcpServer.java
new file mode 100644
index 0000000..2b63e72
--- /dev/null
+++ b/src/main/java/org/apache/joshua/server/TcpServer.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.server;
+
+import java.net.*;
+import java.io.*;
+
+import joshua.decoder.Decoder;
+import joshua.decoder.JoshuaConfiguration;
+
+/**
+ * TCP/IP server. Accepts newline-separated input sentences written to the socket, translates them
+ * all, and writes the resulting translations back out to the socket.
+ */
+public class TcpServer {
+ private final JoshuaConfiguration joshuaConfiguration;
+ private Decoder decoder;
+ private int port;
+
+ public TcpServer(Decoder decoder, int port,JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ this.decoder = decoder;
+ this.port = port;
+ }
+
+ /**
+ * Listens on a port for new socket connections. Concurrently handles multiple socket connections.
+ *
+ * @param args configuration options
+ * @throws IOException
+ */
+ public void start() {
+
+ try {
+ ServerSocket serverSocket = new ServerSocket(joshuaConfiguration.server_port);
+ Decoder.LOG(1, String.format("** TCP Server running and listening on port %d.", port));
+
+ boolean listening = true;
+ while (listening)
+ new ServerThread(serverSocket.accept(), decoder, joshuaConfiguration).start();
+
+ serverSocket.close();
+
+ } catch (IOException e) {
+ System.err.println(String.format("Could not listen on port: %d.", joshuaConfiguration.server_port));
+ System.exit(-1);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java b/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
new file mode 100644
index 0000000..37480d7
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/AlignedSubsampler.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+
+
+/**
+ * A subsampler which takes in word-alignments as well as the F and E files. To remove redundant
+ * code, this class uses callback techniques in order to "override" the superclass methods.
+ *
+ * @see joshua.subsample.Subsampler
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class AlignedSubsampler extends Subsampler {
+
+ public AlignedSubsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+ super(testFiles, maxN, targetCount);
+ }
+
+
+ /**
+ * @param filelist list of source files to subsample from
+ * @param targetFtoERatio goal for ratio of output F length to output E length
+ * @param extf extension of F files
+ * @param exte extension of E files
+ * @param exta extension of alignment files
+ * @param fpath path to source F files
+ * @param epath path to source E files
+ * @param apath path to source alignment files
+ * @param output basename for output files (will append extensions)
+ */
+ public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+ String exta, String fpath, String epath, String apath, String output) throws IOException {
+ this.subsample(filelist, targetFtoERatio, new PhraseWriter(new BufferedWriter(
+ new OutputStreamWriter(new FileOutputStream(output + "." + extf), "UTF8")),
+ new BufferedWriter(
+ new OutputStreamWriter(new FileOutputStream(output + "." + exte), "UTF8")),
+ new BufferedWriter(
+ new OutputStreamWriter(new FileOutputStream(output + "." + exta), "UTF8"))),
+ new BiCorpusFactory(fpath, epath, apath, extf, exte, exta) { /* Local class definition */
+ public BiCorpus fromFiles(String f) throws IOException {
+ return this.alignedFromFiles(f);
+ }
+ });
+ }
+
+
+ @SuppressWarnings("static-access")
+ public static void main(String[] args) {
+ new SubsamplerCLI() { /* Local class definition */
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oa = OptionBuilder.withArgName("lang").hasArg()
+ .withDescription("Word alignment extension").isRequired().create("a");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oapath = OptionBuilder.withArgName("path").hasArg()
+ .withDescription("Directory containing word alignment files").create("apath");
+
+ public Options getCliOptions() {
+ return super.getCliOptions().addOption(oa).addOption(oapath);
+ }
+
+ public String getClassName() {
+ return AlignedSubsampler.class.getName();
+ }
+
+ public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+ throws IOException {
+ new AlignedSubsampler(testFiles, maxN, targetCount).subsample(ot.getValue(), ratio,
+ of.getValue(), oe.getValue(), oa.getValue(), ofpath.getValue(), oepath.getValue(),
+ oapath.getValue(), ooutput.getValue());
+ }
+
+ }.runMain(args);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/Alignment.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/Alignment.java b/src/main/java/org/apache/joshua/subsample/Alignment.java
new file mode 100644
index 0000000..9033a3e
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/Alignment.java
@@ -0,0 +1,84 @@
+/*
+ * This file is based on the edu.umd.clip.mt.Alignment class from the University of Maryland's
+ * umd-hadoop-mt-0.01 project. That project is released under the terms of the Apache License 2.0,
+ * but with special permission for the Joshua Machine Translation System to release modifications
+ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible
+ * with Apache License 2.0
+ */
+package joshua.subsample;
+
+
+/**
+ * A set of word alignments between an F phrase and an E phrase. The implementation uses a
+ * two-dimensional bit vector, though for our purposes we could just keep the original string around
+ * (which would save lots of time parsing and reconstructing the string).
+ *
+ * @see joshua.corpus.alignment.Alignments
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class Alignment {
+ private short eLength;
+ private short fLength;
+ private M2 aligned;
+
+ public Alignment(short fLength, short eLength, String alignments) {
+ this.eLength = eLength;
+ this.fLength = fLength;
+ this.aligned = new M2(fLength, eLength);
+
+ if (alignments == null || alignments.length() == 0) {
+ return;
+ }
+ String[] als = alignments.split("\\s+"); // TODO: joshua.util.Regex
+ for (String al : als) {
+ String[] pair = al.split("-");
+ if (pair.length != 2)
+ throw new IllegalArgumentException("Malformed alignment string: " + alignments);
+ short f = Short.parseShort(pair[0]);
+ short e = Short.parseShort(pair[1]);
+ if (f >= fLength || e >= eLength)
+ throw new IndexOutOfBoundsException("out of bounds: " + f + "," + e);
+ aligned.set(f, e);
+ }
+ }
+
+
+ public String toString() {
+ StringBuffer sb = new StringBuffer();
+ for (short i = 0; i < fLength; i++)
+ for (short j = 0; j < eLength; j++)
+ if (aligned.get(i, j)) sb.append(i).append('-').append(j).append(' ');
+
+ // Remove trailing space
+ if (sb.length() > 0) sb.delete(sb.length() - 1, sb.length());
+
+ return sb.toString();
+ }
+
+
+ /** A (short,short)->boolean map for storing alignments. */
+ private final static class M2 {
+ private short width;
+ private boolean[] bits;
+
+ public M2(short f, short e) {
+ width = f;
+ bits = new boolean[f * e];
+ }
+
+ public boolean get(short f, short e) {
+ return bits[width * e + f];
+ }
+
+ public void set(short f, short e) {
+ try {
+ bits[width * e + f] = true;
+ } catch (ArrayIndexOutOfBoundsException ee) {
+ throw new RuntimeException("Set(" + f + ", " + e + "): caught " + ee);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/BiCorpus.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/BiCorpus.java b/src/main/java/org/apache/joshua/subsample/BiCorpus.java
new file mode 100644
index 0000000..83cba63
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/BiCorpus.java
@@ -0,0 +1,172 @@
+/*
+ * This file is based on the edu.umd.clip.mt.subsample.BiCorpus class from the University of
+ * Maryland's jmtTools project (in conjunction with the umd-hadoop-mt-0.01 project). That project is
+ * released under the terms of the Apache License 2.0, but with special permission for the Joshua
+ * Machine Translation System to release modifications under the LGPL version 2.1. LGPL version 3
+ * requires no special permission since it is compatible with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import joshua.corpus.Phrase;
+
+
+/**
+ * Class for representing a sentence-aligned bi-corpus (with optional word-alignments).
+ * <p>
+ * In order to avoid memory crashes we no longer extend an ArrayList, which tries to cache the
+ * entire file in memory at once. This means we'll re-read through each file (1 +
+ * {@link Subsampler#MAX_SENTENCE_LENGTH} / binsize) times where binsize is determined by the
+ * <code>subsample(String, float, PhraseWriter, BiCorpusFactory)</code> method.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class BiCorpus implements Iterable<PhrasePair> {
+ // Making these final requires Java6, doesn't work in Java5
+ protected final String foreignFileName;
+ protected final String nativeFileName;
+ protected final String alignmentFileName;
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+ /**
+ * Constructor for unaligned BiCorpus.
+ */
+ public BiCorpus(String foreignFileName, String nativeFileName) throws IOException {
+ this(foreignFileName, nativeFileName, null);
+ }
+
+
+ /**
+ * Constructor for word-aligned BiCorpus.
+ */
+ public BiCorpus(String foreignFileName, String nativeFileName, String alignmentFileName)
+ throws IOException, IllegalArgumentException, IndexOutOfBoundsException {
+ this.foreignFileName = foreignFileName;
+ this.nativeFileName = nativeFileName;
+ this.alignmentFileName = alignmentFileName;
+
+ // Check for fileLengthMismatchException
+ // Of course, that will be checked for in each iteration
+ //
+ // We write it this way to avoid warnings from the foreach style loop
+ Iterator<PhrasePair> it = iterator();
+ while (it.hasNext()) {
+ it.next();
+ }
+ }
+
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+ // BUG: We don't close file handles. The other reader classes apparently have finalizers to handle
+ // this well enough for our purposes, but we should migrate to using joshua.util.io.LineReader and
+ // be sure to close it in the end.
+
+ // We're not allowed to throw exceptions from Iterator/Iterable
+ // so we have evil boilerplate to crash the system
+ /**
+ * Iterate through the files represented by this <code>BiCorpus</code>, returning a
+ * {@link PhrasePair} for each pair (or triple) of lines.
+ */
+ @SuppressWarnings("resource")
+ public Iterator<PhrasePair> iterator() {
+ PhraseReader closureRF = null;
+ PhraseReader closureRE = null;
+ BufferedReader closureRA = null;
+ try {
+ closureRF = new PhraseReader(new FileReader(this.foreignFileName), (byte) 1);
+ closureRE = new PhraseReader(new FileReader(this.nativeFileName), (byte) 0);
+ closureRA =
+ (null == this.alignmentFileName ? null : new BufferedReader(new FileReader(
+ this.alignmentFileName)));
+ } catch (FileNotFoundException e) {
+ throw new RuntimeException("File not found", e);
+ }
+ // Making final for closure capturing in the local class definition
+ final PhraseReader rf = closureRF;
+ final PhraseReader re = closureRE;
+ final BufferedReader ra = closureRA;
+
+ return new Iterator<PhrasePair>() { /* Local class definition */
+ private Phrase nextForeignPhrase = null;
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+ public boolean hasNext() {
+ if (null == this.nextForeignPhrase) {
+ try {
+ this.nextForeignPhrase = rf.readPhrase();
+ } catch (IOException e) {
+ throw new RuntimeException("IOException", e);
+ }
+ }
+ return null != this.nextForeignPhrase;
+ }
+
+ public PhrasePair next() {
+ if (this.hasNext()) {
+ Phrase f = this.nextForeignPhrase;
+
+ Phrase e = null;
+ try {
+ e = re.readPhrase();
+ } catch (IOException ioe) {
+ throw new RuntimeException("IOException", ioe);
+ }
+ if (null == e) {
+ fileLengthMismatchException();
+ return null; // Needed to make javac happy
+ } else {
+ if (e.size() != 0 && f.size() != 0) {
+ if (null != ra) {
+ String line = null;
+ try {
+ line = ra.readLine();
+ } catch (IOException ioe) {
+ throw new RuntimeException("IOException", ioe);
+ }
+
+ if (null == line) {
+ fileLengthMismatchException();
+ return null; // Needed to make javac happy
+ } else {
+ Alignment a = new Alignment((short) f.size(), (short) e.size(), line);
+
+ this.nextForeignPhrase = null;
+ return new PhrasePair(f, e, a);
+ }
+ } else {
+ this.nextForeignPhrase = null;
+ return new PhrasePair(f, e);
+ }
+ } else {
+ // Inverted while loop
+ this.nextForeignPhrase = null;
+ return this.next();
+ }
+ }
+ } else {
+ throw new NoSuchElementException();
+ }
+ }
+ }; /* End local class definition */
+ } /* end iterator() */
+
+
+ private static void fileLengthMismatchException() throws RuntimeException {
+ throw new RuntimeException("Mismatched file lengths!");
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java b/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
new file mode 100644
index 0000000..eea8937
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/BiCorpusFactory.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.subsample;
+
+import java.io.File;
+import java.io.IOException;
+
+
+/**
+ * A callback closure for <code>Subsampler.subsample</code>. This class is used by
+ * {@link AlignedSubsampler} in order to "override" methods of {@link Subsampler}, minimizing code
+ * duplication.
+ *
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class BiCorpusFactory {
+ // Making these final requires Java6, doesn't work in Java5
+ protected final String fpath;
+ protected final String epath;
+ protected final String apath;
+ protected final String extf;
+ protected final String exte;
+ protected final String exta;
+
+ public BiCorpusFactory(String fpath, String epath, String apath, String extf, String exte,
+ String exta) {
+ // The various concatenation has been moved up here
+ // to get it out of the loops where fromFiles is called.
+ this.fpath = (fpath == null ? "." : fpath) + File.separator;
+ this.epath = (epath == null ? "." : epath) + File.separator;
+ this.apath = (apath == null ? "." : apath) + File.separator;
+ this.extf = "." + extf;
+ this.exte = "." + exte;
+ this.exta = (exta == null ? null : "." + exta);
+ }
+
+
+ /** Generate unaligned BiCorpus by default. */
+ public BiCorpus fromFiles(String f) throws IOException {
+ return this.unalignedFromFiles(f);
+ }
+
+ /** Generate unaligned BiCorpus. */
+ public BiCorpus unalignedFromFiles(String f) throws IOException {
+ return new BiCorpus(fpath + f + extf, epath + f + exte);
+ }
+
+ /** Generate aligned BiCorpus. */
+ public BiCorpus alignedFromFiles(String f) throws IOException {
+ return new BiCorpus(fpath + f + extf, epath + f + exte, apath + f + exta);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/PhrasePair.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/PhrasePair.java b/src/main/java/org/apache/joshua/subsample/PhrasePair.java
new file mode 100644
index 0000000..36a1da5
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/PhrasePair.java
@@ -0,0 +1,64 @@
+/*
+ * This file is based on the edu.umd.clip.mt.PhrasePair class from the University of Maryland's
+ * umd-hadoop-mt-0.01 project. That project is released under the terms of the Apache License 2.0,
+ * but with special permission for the Joshua Machine Translation System to release modifications
+ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible
+ * with Apache License 2.0
+ */
+package joshua.subsample;
+
+// TODO: if we generalize the Alignment class, we could move this
+// to joshua.util.sentence.
+
+import joshua.corpus.Phrase;
+
+
+/**
+ * Phrase-aligned tuple class associating an F phrase, E phrase, and (possibly null)
+ * word-alignments. This is primarily for maintaining sentence-alignment.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class PhrasePair {
+ // Making these final requires Java6, not Java5
+ private final Phrase f;
+ private final Phrase e;
+ private final Alignment a;
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+ public PhrasePair(Phrase f_, Phrase e_) {
+ this(f_, e_, null);
+ }
+
+ public PhrasePair(Phrase f, Phrase e, Alignment a) {
+ this.f = f;
+ this.e = e;
+ this.a = a;
+ }
+
+ // ===============================================================
+ // Attributes
+ // ===============================================================
+ public Phrase getF() {
+ return f;
+ }
+
+ public Phrase getE() {
+ return e;
+ }
+
+ public Alignment getAlignment() {
+ return a;
+ }
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+ public float ratioFtoE() {
+ return ((float) this.f.size()) / ((float) this.e.size());
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/PhraseReader.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/PhraseReader.java b/src/main/java/org/apache/joshua/subsample/PhraseReader.java
new file mode 100644
index 0000000..f6dd6d3
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/PhraseReader.java
@@ -0,0 +1,36 @@
+/*
+ * This file is based on the edu.umd.clip.mt.PhraseReader class from the University of Maryland's
+ * umd-hadoop-mt-0.01 project. That project is released under the terms of the Apache License 2.0,
+ * but with special permission for the Joshua Machine Translation System to release modifications
+ * under the LGPL version 2.1. LGPL version 3 requires no special permission since it is compatible
+ * with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import joshua.corpus.BasicPhrase;
+
+
+/**
+ * Wrapper class to read in each line as a BasicPhrase.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class PhraseReader extends BufferedReader {
+ private byte language;
+
+ public PhraseReader(Reader r, byte language) {
+ super(r);
+ this.language = language;
+ }
+
+ public BasicPhrase readPhrase() throws IOException {
+ String line = super.readLine();
+ return (line == null ? null : new BasicPhrase(this.language, line));
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/PhraseWriter.java b/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
new file mode 100644
index 0000000..16a3563
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/PhraseWriter.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.subsample;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+
+
+/**
+ * A PhrasePair-parallel BufferedWriter. In an ideal world we could get the compiler to inline all
+ * of this, to have zero-overhead while not duplicating code. Alas, Java's not that cool. The
+ * "final" could help on JIT at least.
+ *
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+final public class PhraseWriter {
+ // Making these final requires Java6, not Java5
+ private final BufferedWriter wf;
+ private final BufferedWriter we;
+ private final BufferedWriter wa;
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+ public PhraseWriter(BufferedWriter wf_, BufferedWriter we_) {
+ this(wf_, we_, null);
+ }
+
+ public PhraseWriter(BufferedWriter wf, BufferedWriter we, BufferedWriter wa) {
+ this.wf = wf;
+ this.we = we;
+ this.wa = wa;
+ }
+
+
+ // ===============================================================
+ // Methods
+ // ===============================================================
+ public void write(PhrasePair pp) throws IOException {
+ this.wf.write(pp.getF().toString());
+ this.we.write(pp.getE().toString());
+ if (null != this.wa) this.wa.write(pp.getAlignment().toString());
+ }
+
+ public void newLine() throws IOException {
+ this.wf.newLine();
+ this.we.newLine();
+ if (null != this.wa) this.wa.newLine();
+ }
+
+ public void flush() throws IOException {
+ this.wf.flush();
+ this.we.flush();
+ if (null != this.wa) this.wa.flush();
+ }
+
+ public void close() throws IOException {
+ this.wf.close();
+ this.we.close();
+ if (null != this.wa) this.wa.close();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/Subsampler.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/Subsampler.java b/src/main/java/org/apache/joshua/subsample/Subsampler.java
new file mode 100644
index 0000000..49e1a16
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/Subsampler.java
@@ -0,0 +1,228 @@
+/*
+ * This file is based on the edu.umd.clip.mt.subsample.Subsampler class from the University of
+ * Maryland's jmtTools project (in conjunction with the umd-hadoop-mt-0.01 project). That project is
+ * released under the terms of the Apache License 2.0, but with special permission for the Joshua
+ * Machine Translation System to release modifications under the LGPL version 2.1. LGPL version 3
+ * requires no special permission since it is compatible with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import joshua.corpus.BasicPhrase;
+import joshua.corpus.Phrase;
+
+
+/**
+ * A class for subsampling a large (F,E)-parallel sentence-aligned corpus to generate a smaller
+ * corpus whose N-grams are relevant to some seed corpus. The idea of subsampling owes to Kishore
+ * Papineni.
+ *
+ * @author UMD (Jimmy Lin, Chris Dyer, et al.)
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+public class Subsampler {
+ protected Map<Phrase, Integer> ngramCounts;
+ protected int maxN;
+ protected int targetCount;
+ protected int maxSubsample = 1500000;
+
+ protected static final int MAX_SENTENCE_LENGTH = 100;
+ protected static final int MIN_RATIO_LENGTH = 10;
+
+
+ public Subsampler(String[] testFiles, int maxN, int targetCount) throws IOException {
+ this.maxN = maxN;
+ this.targetCount = targetCount;
+ this.ngramCounts = loadNgrams(testFiles);
+ }
+
+ private HashMap<Phrase, Integer> loadNgrams(String[] files) throws IOException {
+ HashMap<Phrase, Integer> map = new HashMap<Phrase, Integer>();
+ for (String fn : files) {
+ System.err.println("Loading test set from " + fn + "...");
+
+ PhraseReader reader = new PhraseReader(new FileReader(fn), (byte) 1);
+ Phrase phrase;
+ int lineCount = 0;
+ try {
+ while ((phrase = reader.readPhrase()) != null) {
+ lineCount++;
+ List<Phrase> ngrams = phrase.getSubPhrases(this.maxN);
+ for (Phrase ngram : ngrams)
+ map.put(ngram, 0);
+ }
+ } finally {
+ reader.close();
+ }
+ System.err.println("Processed " + lineCount + " lines in " + fn);
+ }
+ System.err.println("Test set: " + map.size() + " ngrams");
+ return map;
+ }
+
+
+ /**
+ * The general subsampler function for external use.
+ *
+ * @param filelist list of source files to subsample from
+ * @param targetFtoERatio goal for ratio of output F length to output E length
+ * @param extf extension of F files
+ * @param exte extension of E files
+ * @param fpath path to source F files
+ * @param epath path to source E files
+ * @param output basename for output files (will append extensions)
+ */
+ public void subsample(String filelist, float targetFtoERatio, String extf, String exte,
+ String fpath, String epath, String output) throws IOException {
+ this.subsample(filelist, targetFtoERatio, new PhraseWriter(new BufferedWriter(
+ new OutputStreamWriter(new FileOutputStream(output + "." + extf), "UTF8")),
+ new BufferedWriter(
+ new OutputStreamWriter(new FileOutputStream(output + "." + exte), "UTF8"))),
+ new BiCorpusFactory(fpath, epath, null, extf, exte, null));
+ }
+
+ /**
+ * The main wrapper for the subsample worker. Closes the PhraseWriter before exiting.
+ */
+ protected void subsample(String filelist, float targetFtoERatio, PhraseWriter out,
+ BiCorpusFactory bcFactory) throws IOException {
+ try {
+ // Read filenames into a list
+ List<String> files = new ArrayList<String>();
+ {
+ FileReader fr = null;
+ BufferedReader br = null;
+ try {
+ fr = new FileReader(filelist);
+ br = new BufferedReader(fr);
+ String file;
+ while ((file = br.readLine()) != null) {
+ files.add(file);
+ }
+ } finally {
+ // Maybe redundant, but UMD's FixBugs says to
+ // close br (and close is idempotent anyways)
+ if (null != fr) fr.close();
+ if (null != br) br.close();
+ }
+ }
+
+ int totalSubsampled = 0;
+ // Iterating on files in order biases towards files
+ // earlier in the list
+ for (String f : files) {
+ System.err.println("Loading training data: " + f);
+
+ BiCorpus bc = bcFactory.fromFiles(f);
+
+ HashMap<PhrasePair, PhrasePair> set = new HashMap<PhrasePair, PhrasePair>();
+
+ int binsize = 10; // BUG: Magic-Number
+ int max_k = MAX_SENTENCE_LENGTH / binsize;
+ System.err.print("Looking in length range");
+ // Iterating bins from small to large biases
+ // towards short sentences
+ for (int k = 0; k < max_k; k++) {
+ System.err.print(" [" + (k * binsize + 1) + "," + ((k + 1) * binsize) + "]");
+ System.err.flush();
+
+ this.subsample(set, bc, k * binsize + 1, (k + 1) * binsize, targetFtoERatio);
+
+ if (set.size() + totalSubsampled > maxSubsample) break;
+ }
+
+ float ff = 0.0f;
+ float ef = 0.0f;
+ for (PhrasePair pp : set.keySet()) {
+ // Get pp.ratioFtoE() for all pp
+ ff += pp.getF().size();
+ ef += pp.getE().size();
+
+ out.write(set.get(pp));
+ out.newLine();
+ }
+ out.flush();
+
+ totalSubsampled += set.size();
+ System.err.println("\n current=" + set.size() + " [total=" + totalSubsampled
+ + "] currentRatio=" + (ff / ef));
+ System.err.flush();
+
+ // TODO: is this gc actually dubious? Or
+ // does profiling show it helps? We only
+ // do it once per file, so it's not a
+ // performance blackhole.
+ set = null;
+ bc = null;
+ System.gc();
+ }
+ } finally {
+ out.close();
+ }
+ }
+
+ /**
+ * The worker function for subsampling.
+ *
+ * @param set The set to put selected sentences into
+ * @param bc The sentence-aligned corpus to read from
+ * @param minLength The minimum F sentence length
+ * @param maxLength The maximum F sentence length
+ * @param targetFtoERatio The desired ratio of F length to E length
+ */
+ private void subsample(HashMap<PhrasePair, PhrasePair> set, BiCorpus bc, int minLength,
+ int maxLength, float targetFtoERatio) {
+ for (PhrasePair pp : bc) {
+ PhrasePair lowercase_pp =
+ new PhrasePair(new BasicPhrase((byte) 1, pp.getF().toString().toLowerCase()),
+ new BasicPhrase((byte) 1, pp.getE().toString().toLowerCase()), pp.getAlignment());
+
+ {
+ int eLength = pp.getE().size();
+ if (eLength == 0 || eLength > MAX_SENTENCE_LENGTH) continue;
+ }
+
+ int fLength = pp.getF().size();
+ if (fLength == 0 || fLength < minLength || fLength > maxLength
+ || fLength > MAX_SENTENCE_LENGTH) continue;
+ if (fLength > 10 && targetFtoERatio != 0.0f) {
+ float ratio = pp.ratioFtoE();
+ if (fLength >= MIN_RATIO_LENGTH
+ && (ratio > 1.3f * targetFtoERatio || ratio * 1.3f < targetFtoERatio)) continue;
+ }
+ if (set.containsKey(lowercase_pp)) continue;
+
+ // at this point, length checks out and the sentence hasn't
+ // been selected yet
+
+ List<Phrase> ngrams = pp.getF().getSubPhrases(this.maxN);
+ boolean useSentence = false;
+ for (Phrase ng : ngrams) {
+ Integer count = this.ngramCounts.get(ng);
+ if (count == null) continue;
+ if (count < targetCount) {
+ useSentence = true;
+ count++;
+ this.ngramCounts.put(ng, count);
+ }
+ }
+ if (useSentence) set.put(lowercase_pp, pp);
+ }
+ }
+
+
+ public static void main(String[] args) {
+ new SubsamplerCLI().runMain(args);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java b/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
new file mode 100644
index 0000000..ad80b74
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/SubsamplerCLI.java
@@ -0,0 +1,121 @@
+/*
+ * This file uses code from the edu.umd.clip.mt.subsample.Subsampler class from the University of
+ * Maryland's jmtTools project (in conjunction with the umd-hadoop-mt-0.01 project). That project is
+ * released under the terms of the Apache License 2.0, but with special permission for the Joshua
+ * Machine Translation System to release modifications under the LGPL version 2.1. LGPL version 3
+ * requires no special permission since it is compatible with Apache License 2.0
+ */
+package joshua.subsample;
+
+import java.io.IOException;
+
+import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Options;
+import org.apache.commons.cli.ParseException;
+
+
+/**
+ * This class defines a callback closure to allow "overriding" the main function in subclasses of
+ * {@link Subsampler}, without duplicating code. For all subclasses, CLI <code>Options</code> should
+ * be members of the class (so they're visible to <code>runSubsampler</code> as well as
+ * <code>getCliOptions</code>), the <code>getCliOptions</code> method should be overridden to add
+ * the additional options (via <code>super</code> to keep the old options), and the
+ * <code>runSubsampler</code> method should be overridden to do the primary work for main. The
+ * <code>runMain</code> method ties everything together and should not need modification. Due to the
+ * one-use nature of subclasses of <code>SubsampleCLI</code>, they generally should be implemented
+ * as anonymous local classes.
+ *
+ * @author wren ng thornton <wr...@users.sourceforge.net>
+ * @version $LastChangedDate$
+ */
+@SuppressWarnings("static-access")
+public class SubsamplerCLI {
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option ot = OptionBuilder.withArgName("listfile").hasArg()
+ .withDescription("A file containing a list of training file basenames (what to sample from)")
+ .isRequired().create("training");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option otest = OptionBuilder.withArgName("file").hasArgs()
+ .withDescription("The test file (what to sample for)").isRequired().create("test");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option ooutput = OptionBuilder.withArgName("basename").hasArgs()
+ .withDescription("File basename for output training corpus").isRequired().create("output");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option of = OptionBuilder.withArgName("lang").hasArg()
+ .withDescription("Foreign language extension").isRequired().create("f");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oe = OptionBuilder.withArgName("lang").hasArg()
+ .withDescription("Native language extension").isRequired().create("e");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option ofpath = OptionBuilder.withArgName("path").hasArg()
+ .withDescription("Directory containing foreign language files").create("fpath");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oepath = OptionBuilder.withArgName("path").hasArg()
+ .withDescription("Directory containing native language files").create("epath");
+
+ // TODO hasArg is a static method. It should be accessed as OptionBuilder.hasArg()
+ protected final Option oratio = OptionBuilder.withArgName("ratio").hasArg()
+ .withDescription("Target F/E ratio").create("ratio");
+
+ /**
+ * Return all Options. The HelpFormatter will print them in sorted order, so it doesn't matter
+ * when we add them. Subclasses should override this method by adding more options.
+ */
+ public Options getCliOptions() {
+ return new Options().addOption(ot).addOption(otest).addOption(of).addOption(oe)
+ .addOption(ofpath).addOption(oepath).addOption(oratio).addOption(ooutput);
+ }
+
+ /**
+ * This method should be overridden to return the class used in runSubsampler.
+ */
+ public String getClassName() {
+ return Subsampler.class.getName();
+ }
+
+ /**
+ * Callback to run the subsampler. This function needs access to the variables holding each
+ * Option, thus all this closure nonsense.
+ */
+ public void runSubsampler(String[] testFiles, int maxN, int targetCount, float ratio)
+ throws IOException {
+ new Subsampler(testFiles, maxN, targetCount).subsample(ot.getValue(), ratio, of.getValue(),
+ oe.getValue(), ofpath.getValue(), oepath.getValue(), ooutput.getValue());
+ }
+
+ /**
+ * Non-static version of main so that we can define anonymous local classes to override or extend
+ * the above.
+ */
+ public void runMain(String[] args) {
+ Options o = this.getCliOptions();
+ try {
+ new GnuParser().parse(o, args);
+ } catch (ParseException pe) {
+ // The message from pe is ugly, so we omit it.
+ System.err.println("Error parsing command line");
+ new HelpFormatter().printHelp(this.getClassName(), o);
+ System.exit(1);
+ }
+
+ try {
+ float ratio = 0.8f;
+ if (this.oratio.getValue() != null) {
+ ratio = Float.parseFloat(this.oratio.getValue());
+ }
+ this.runSubsampler(this.otest.getValues(), 12, 20, ratio);
+ } catch (Exception e) {
+ e.printStackTrace();
+ System.exit(1);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/subsample/package.html
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/subsample/package.html b/src/main/java/org/apache/joshua/subsample/package.html
new file mode 100644
index 0000000..bed439c
--- /dev/null
+++ b/src/main/java/org/apache/joshua/subsample/package.html
@@ -0,0 +1,25 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head></head>
+<body bgcolor="white">
+
+<!--
+##### THIS IS THE TEMPLATE FOR THE PACKAGE DOC COMMENTS. #####
+##### TYPE YOUR PACKAGE COMMENTS HERE. BEGIN WITH A #####
+##### ONE-SENTENCE SUMMARY STARTING WITH A VERB LIKE: #####
+-->
+
+Provides executables Subsampler and AlignedSubsampler, for subsampling from large training corpora based on a test corpus.
+
+<!--
+<h2>Related Documentation</h2>
+
+<ul>
+ <li>Much of the code in this package is based on .....
+</ul>
+-->
+
+<!-- Put @see and @since tags down here. -->
+
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/main/java/org/apache/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/tools/GrammarPacker.java b/src/main/java/org/apache/joshua/tools/GrammarPacker.java
new file mode 100644
index 0000000..33d3391
--- /dev/null
+++ b/src/main/java/org/apache/joshua/tools/GrammarPacker.java
@@ -0,0 +1,983 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.tools;
+
+import static joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Queue;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import joshua.corpus.Vocabulary;
+import joshua.util.FormatUtils;
+import joshua.util.encoding.EncoderConfiguration;
+import joshua.util.encoding.FeatureTypeAnalyzer;
+import joshua.util.encoding.IntEncoder;
+import joshua.util.io.LineReader;
+
+public class GrammarPacker {
+
+ private static final Logger logger = Logger.getLogger(GrammarPacker.class.getName());
+
+ // Size limit for slice in bytes.
+ private static int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
+ // Estimated average number of feature entries for one rule.
+ private static int DATA_SIZE_ESTIMATE = 20;
+
+ private static final String SOURCE_WORDS_SEPARATOR = " ||| ";
+
+ // Output directory name.
+ private String output;
+
+ // Input grammar to be packed.
+ private String grammar;
+
+ public String getGrammar() {
+ return grammar;
+ }
+
+ public String getOutputDirectory() {
+ return output;
+ }
+
+ // Approximate maximum size of a slice in number of rules
+ private int approximateMaximumSliceSize;
+
+ private boolean labeled;
+
+ private boolean packAlignments;
+ private boolean grammarAlignments;
+ private String alignments;
+
+ private FeatureTypeAnalyzer types;
+ private EncoderConfiguration encoderConfig;
+
+ private String dump;
+
+ private int max_source_len;
+
+ public GrammarPacker(String grammar_filename, String config_filename, String output_filename,
+ String alignments_filename, String featuredump_filename, boolean grammar_alignments,
+ int approximateMaximumSliceSize)
+ throws IOException {
+ this.labeled = true;
+ this.grammar = grammar_filename;
+ this.output = output_filename;
+ this.dump = featuredump_filename;
+ this.grammarAlignments = grammar_alignments;
+ this.approximateMaximumSliceSize = approximateMaximumSliceSize;
+ this.max_source_len = 0;
+
+ // TODO: Always open encoder config? This is debatable.
+ this.types = new FeatureTypeAnalyzer(true);
+
+ this.alignments = alignments_filename;
+ packAlignments = grammarAlignments || (alignments != null);
+ if (!packAlignments) {
+ logger.info("No alignments file or grammar specified, skipping.");
+ } else if (alignments != null && !new File(alignments_filename).exists()) {
+ logger.severe("Alignments file does not exist: " + alignments);
+ System.exit(1);
+ }
+
+ if (config_filename != null) {
+ readConfig(config_filename);
+ types.readConfig(config_filename);
+ } else {
+ logger.info("No config specified. Attempting auto-detection of feature types.");
+ }
+ logger.info(String.format("Approximate maximum slice size (in # of rules) set to %s", approximateMaximumSliceSize));
+
+ File working_dir = new File(output);
+ working_dir.mkdir();
+ if (!working_dir.exists()) {
+ logger.severe("Failed creating output directory.");
+ System.exit(1);
+ }
+ }
+
+ private void readConfig(String config_filename) throws IOException {
+ LineReader reader = new LineReader(config_filename);
+ while (reader.hasNext()) {
+ // Clean up line, chop comments off and skip if the result is empty.
+ String line = reader.next().trim();
+ if (line.indexOf('#') != -1)
+ line = line.substring(0, line.indexOf('#'));
+ if (line.isEmpty())
+ continue;
+ String[] fields = line.split("[\\s]+");
+
+ if (fields.length < 2) {
+ logger.severe("Incomplete line in config.");
+ System.exit(1);
+ }
+ if ("slice_size".equals(fields[0])) {
+ // Number of records to concurrently load into memory for sorting.
+ approximateMaximumSliceSize = Integer.parseInt(fields[1]);
+ }
+ }
+ reader.close();
+ }
+
+ /**
+ * Executes the packing.
+ *
+ * @throws IOException
+ */
+ public void pack() throws IOException {
+ logger.info("Beginning exploration pass.");
+ LineReader grammar_reader = null;
+ LineReader alignment_reader = null;
+
+ // Explore pass. Learn vocabulary and feature value histograms.
+ logger.info("Exploring: " + grammar);
+ grammar_reader = new LineReader(grammar);
+ explore(grammar_reader);
+
+ logger.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
+ if (dump != null) {
+ PrintWriter dump_writer = new PrintWriter(dump);
+ dump_writer.println(types.toString());
+ dump_writer.close();
+ }
+
+ types.inferTypes(this.labeled);
+ logger.info("Type inference complete.");
+
+ logger.info("Finalizing encoding.");
+
+ logger.info("Writing encoding.");
+ types.write(output + File.separator + "encoding");
+
+ writeVocabulary();
+
+ String configFile = output + File.separator + "config";
+ logger.info(String.format("Writing config to '%s'", configFile));
+ // Write config options
+ FileWriter config = new FileWriter(configFile);
+ config.write(String.format("max-source-len = %d\n", max_source_len));
+ config.close();
+
+ // Read previously written encoder configuration to match up to changed
+ // vocabulary id's.
+ logger.info("Reading encoding.");
+ encoderConfig = new EncoderConfiguration();
+ encoderConfig.load(output + File.separator + "encoding");
+
+ logger.info("Beginning packing pass.");
+ // Actual binarization pass. Slice and pack source, target and data.
+ grammar_reader = new LineReader(grammar);
+
+ if (packAlignments && !grammarAlignments)
+ alignment_reader = new LineReader(alignments);
+ binarize(grammar_reader, alignment_reader);
+ logger.info("Packing complete.");
+
+ logger.info("Packed grammar in: " + output);
+ logger.info("Done.");
+ }
+
+ private void explore(LineReader grammar) {
+ int counter = 0;
+ // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
+ // appear in the same order. They are assigned numeric names in order of appearance.
+ this.types.setLabeled(true);
+
+ while (grammar.hasNext()) {
+ String line = grammar.next().trim();
+ counter++;
+ ArrayList<String> fields = new ArrayList<String>(Arrays.asList(line.split("\\s\\|{3}\\s")));
+
+ String lhs = null;
+ if (line.startsWith("[")) {
+ // hierarchical model
+ if (fields.size() < 4) {
+ logger.warning(String.format("Incomplete grammar line at line %d: '%s'", counter, line));
+ continue;
+ }
+ lhs = fields.remove(0);
+ } else {
+ // phrase-based model
+ if (fields.size() < 3) {
+ logger.warning("Incomplete phrase line at line " + counter);
+ logger.warning(line);
+ continue;
+ }
+ lhs = "[X]";
+ }
+
+ String[] source = fields.get(0).split("\\s");
+ String[] target = fields.get(1).split("\\s");
+ String[] features = fields.get(2).split("\\s");
+
+ max_source_len = Math.max(max_source_len, source.length);
+
+ Vocabulary.id(lhs);
+ try {
+ /* Add symbols to vocabulary.
+ * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
+ * and "[X,1]" to the vocabulary.
+ */
+ for (String source_word : source) {
+ Vocabulary.id(source_word);
+ if (FormatUtils.isNonterminal(source_word)) {
+ Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_word));
+ }
+ }
+ for (String target_word : target) {
+ Vocabulary.id(target_word);
+ if (FormatUtils.isNonterminal(target_word)) {
+ Vocabulary.id(FormatUtils.stripNonTerminalIndex(target_word));
+ }
+ }
+ } catch (java.lang.StringIndexOutOfBoundsException e) {
+ System.err.println(String.format("* Skipping bad grammar line '%s'", line));
+ continue;
+ }
+
+ // Add feature names to vocabulary and pass the value through the
+ // appropriate encoder.
+ int feature_counter = 0;
+ for (int f = 0; f < features.length; ++f) {
+ if (features[f].contains("=")) {
+ String[] fe = features[f].split("=");
+ if (fe[0].equals("Alignment"))
+ continue;
+ types.observe(Vocabulary.id(fe[0]), Float.parseFloat(fe[1]));
+ } else {
+ types.observe(Vocabulary.id(String.valueOf(feature_counter++)),
+ Float.parseFloat(features[f]));
+ }
+ }
+ }
+ }
+
+ /**
+ * Returns a String encoding the first two source words.
+ * If there is only one source word, use empty string for the second.
+ */
+ private String getFirstTwoSourceWords(final String[] source_words) {
+ return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
+ }
+
+ private void binarize(LineReader grammar_reader, LineReader alignment_reader) throws IOException {
+ int counter = 0;
+ int slice_counter = 0;
+ int num_slices = 0;
+
+ boolean ready_to_flush = false;
+ // to determine when flushing is possible
+ String prev_first_two_source_words = null;
+
+ PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
+ PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
+ FeatureBuffer feature_buffer = new FeatureBuffer();
+
+ AlignmentBuffer alignment_buffer = null;
+ if (packAlignments)
+ alignment_buffer = new AlignmentBuffer();
+
+ TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
+ while (grammar_reader.hasNext()) {
+ String grammar_line = grammar_reader.next().trim();
+ counter++;
+ slice_counter++;
+
+ ArrayList<String> fields = new ArrayList<String>(Arrays.asList(grammar_line.split("\\s\\|{3}\\s")));
+ String lhs_word;
+ String[] source_words;
+ String[] target_words;
+ String[] feature_entries;
+ if (grammar_line.startsWith("[")) {
+ if (fields.size() < 4)
+ continue;
+
+ lhs_word = fields.remove(0);
+ source_words = fields.get(0).split("\\s");
+ target_words = fields.get(1).split("\\s");
+ feature_entries = fields.get(2).split("\\s");
+
+ } else {
+ if (fields.size() < 3)
+ continue;
+
+ lhs_word = "[X]";
+ String tmp = "[X,1] " + fields.get(0);
+ source_words = tmp.split("\\s");
+ tmp = "[X,1] " + fields.get(1);
+ target_words = tmp.split("\\s");
+ feature_entries = fields.get(2).split("\\s");
+ }
+
+ // Reached slice limit size, indicate that we're closing up.
+ if (!ready_to_flush
+ && (slice_counter > approximateMaximumSliceSize
+ || feature_buffer.overflowing()
+ || (packAlignments && alignment_buffer.overflowing()))) {
+ ready_to_flush = true;
+ // store the first two source words when slice size limit was reached
+ prev_first_two_source_words = getFirstTwoSourceWords(source_words);
+ }
+ // ready to flush
+ if (ready_to_flush) {
+ final String first_two_source_words = getFirstTwoSourceWords(source_words);
+ // the grammar can only be partitioned at the level of first two source word changes.
+ // Thus, we can only flush if the current first two source words differ from the ones
+ // when the slice size limit was reached.
+ if (!first_two_source_words.equals(prev_first_two_source_words)) {
+ logger.warning(String.format("ready to flush and first two words have changed (%s vs. %s)", prev_first_two_source_words, first_two_source_words));
+ logger.info(String.format("flushing %d rules to slice.", slice_counter));
+ flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+ source_trie.clear();
+ target_trie.clear();
+ feature_buffer.clear();
+ if (packAlignments)
+ alignment_buffer.clear();
+
+ num_slices++;
+ slice_counter = 0;
+ ready_to_flush = false;
+ }
+ }
+
+ int alignment_index = -1;
+ // If present, process alignments.
+ if (packAlignments) {
+ String alignment_line;
+ if (grammarAlignments) {
+ alignment_line = fields.get(3);
+ } else {
+ if (!alignment_reader.hasNext()) {
+ logger.severe("No more alignments starting in line " + counter);
+ throw new RuntimeException("No more alignments starting in line " + counter);
+ }
+ alignment_line = alignment_reader.next().trim();
+ }
+ String[] alignment_entries = alignment_line.split("\\s");
+ byte[] alignments = new byte[alignment_entries.length * 2];
+ if (alignment_entries.length != 0) {
+ for (int i = 0; i < alignment_entries.length; i++) {
+ String[] parts = alignment_entries[i].split("-");
+ alignments[2 * i] = Byte.parseByte(parts[0]);
+ alignments[2 * i + 1] = Byte.parseByte(parts[1]);
+ }
+ }
+ alignment_index = alignment_buffer.add(alignments);
+ }
+
+ // Process features.
+ // Implicitly sort via TreeMap, write to data buffer, remember position
+ // to pass on to the source trie node.
+ features.clear();
+ int feature_count = 0;
+ for (int f = 0; f < feature_entries.length; ++f) {
+ String feature_entry = feature_entries[f];
+ int feature_id;
+ float feature_value;
+ if (feature_entry.contains("=")) {
+ String[] parts = feature_entry.split("=");
+ if (parts[0].equals("Alignment"))
+ continue;
+ feature_id = Vocabulary.id(parts[0]);
+ feature_value = Float.parseFloat(parts[1]);
+ } else {
+ feature_id = Vocabulary.id(String.valueOf(feature_count++));
+ feature_value = Float.parseFloat(feature_entry);
+ }
+ if (feature_value != 0)
+ features.put(encoderConfig.innerId(feature_id), feature_value);
+ }
+ int features_index = feature_buffer.add(features);
+
+ // Sanity check on the data block index.
+ if (packAlignments && features_index != alignment_index) {
+ logger.severe("Block index mismatch between features (" + features_index
+ + ") and alignments (" + alignment_index + ").");
+ throw new RuntimeException("Data block index mismatch.");
+ }
+
+ // Process source side.
+ SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
+ int[] source = new int[source_words.length];
+ for (int i = 0; i < source_words.length; i++) {
+ if (FormatUtils.isNonterminal(source_words[i]))
+ source[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_words[i]));
+ else
+ source[i] = Vocabulary.id(source_words[i]);
+ }
+ source_trie.add(source, sv);
+
+ // Process target side.
+ TargetValue tv = new TargetValue(sv);
+ int[] target = new int[target_words.length];
+ for (int i = 0; i < target_words.length; i++) {
+ if (FormatUtils.isNonterminal(target_words[i])) {
+ target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
+ } else {
+ target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
+ }
+ }
+ target_trie.add(target, tv);
+ }
+ // flush last slice and clear buffers
+ flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+ }
+
+ /**
+ * Serializes the source, target and feature data structures into interlinked binary files. Target
+ * is written first, into a skeletal (node don't carry any data) upward-pointing trie, updating
+ * the linking source trie nodes with the position once it is known. Source and feature data are
+ * written simultaneously. The source structure is written into a downward-pointing trie and
+ * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
+ * prompted to write out a block
+ *
+ * @param source_trie
+ * @param target_trie
+ * @param feature_buffer
+ * @param id
+ * @throws IOException
+ */
+ private void flush(PackingTrie<SourceValue> source_trie,
+ PackingTrie<TargetValue> target_trie, FeatureBuffer feature_buffer,
+ AlignmentBuffer alignment_buffer, int id) throws IOException {
+ // Make a slice object for this piece of the grammar.
+ PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
+ // Pull out the streams for source, target and data output.
+ DataOutputStream source_stream = slice.getSourceOutput();
+ DataOutputStream target_stream = slice.getTargetOutput();
+ DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
+ DataOutputStream feature_stream = slice.getFeatureOutput();
+ DataOutputStream alignment_stream = slice.getAlignmentOutput();
+
+ Queue<PackingTrie<TargetValue>> target_queue;
+ Queue<PackingTrie<SourceValue>> source_queue;
+
+ // The number of bytes both written into the source stream and
+ // buffered in the source queue.
+ int source_position;
+ // The number of bytes written into the target stream.
+ int target_position;
+
+ // Add trie root into queue, set target position to 0 and set cumulated
+ // size to size of trie root.
+ target_queue = new LinkedList<PackingTrie<TargetValue>>();
+ target_queue.add(target_trie);
+ target_position = 0;
+
+ // Target lookup table for trie levels.
+ int current_level_size = 1;
+ int next_level_size = 0;
+ ArrayList<Integer> target_lookup = new ArrayList<Integer>();
+
+ // Packing loop for upwards-pointing target trie.
+ while (!target_queue.isEmpty()) {
+ // Pop top of queue.
+ PackingTrie<TargetValue> node = target_queue.poll();
+ // Register that this is where we're writing the node to.
+ node.address = target_position;
+ // Tell source nodes that we're writing to this position in the file.
+ for (TargetValue tv : node.values)
+ tv.parent.target = node.address;
+ // Write link to parent.
+ if (node.parent != null)
+ target_stream.writeInt(node.parent.address);
+ else
+ target_stream.writeInt(-1);
+ target_stream.writeInt(node.symbol);
+ // Enqueue children.
+ for (int k : node.children.descendingKeySet()) {
+ PackingTrie<TargetValue> child = node.children.get(k);
+ target_queue.add(child);
+ }
+ target_position += node.size(false, true);
+ next_level_size += node.children.descendingKeySet().size();
+
+ current_level_size--;
+ if (current_level_size == 0) {
+ target_lookup.add(target_position);
+ current_level_size = next_level_size;
+ next_level_size = 0;
+ }
+ }
+ target_lookup_stream.writeInt(target_lookup.size());
+ for (int i : target_lookup)
+ target_lookup_stream.writeInt(i);
+ target_lookup_stream.close();
+
+ // Setting up for source and data writing.
+ source_queue = new LinkedList<PackingTrie<SourceValue>>();
+ source_queue.add(source_trie);
+ source_position = source_trie.size(true, false);
+ source_trie.address = target_position;
+
+ // Ready data buffers for writing.
+ feature_buffer.initialize();
+ if (packAlignments)
+ alignment_buffer.initialize();
+
+ // Packing loop for downwards-pointing source trie.
+ while (!source_queue.isEmpty()) {
+ // Pop top of queue.
+ PackingTrie<SourceValue> node = source_queue.poll();
+ // Write number of children.
+ source_stream.writeInt(node.children.size());
+ // Write links to children.
+ for (int k : node.children.descendingKeySet()) {
+ PackingTrie<SourceValue> child = node.children.get(k);
+ // Enqueue child.
+ source_queue.add(child);
+ // Child's address will be at the current end of the queue.
+ child.address = source_position;
+ // Advance cumulated size by child's size.
+ source_position += child.size(true, false);
+ // Write the link.
+ source_stream.writeInt(k);
+ source_stream.writeInt(child.address);
+ }
+ // Write number of data items.
+ source_stream.writeInt(node.values.size());
+ // Write lhs and links to target and data.
+ for (SourceValue sv : node.values) {
+ int feature_block_index = feature_buffer.write(sv.data);
+ if (packAlignments) {
+ int alignment_block_index = alignment_buffer.write(sv.data);
+ if (alignment_block_index != feature_block_index) {
+ logger.severe("Block index mismatch.");
+ throw new RuntimeException("Block index mismatch: alignment (" + alignment_block_index
+ + ") and features (" + feature_block_index + ") don't match.");
+ }
+ }
+ source_stream.writeInt(sv.lhs);
+ source_stream.writeInt(sv.target);
+ source_stream.writeInt(feature_block_index);
+ }
+ }
+ // Flush the data stream.
+ feature_buffer.flush(feature_stream);
+ if (packAlignments)
+ alignment_buffer.flush(alignment_stream);
+
+ target_stream.close();
+ source_stream.close();
+ feature_stream.close();
+ if (packAlignments)
+ alignment_stream.close();
+ }
+
+ public void writeVocabulary() throws IOException {
+ final String vocabularyFilename = output + File.separator + VOCABULARY_FILENAME;
+ logger.info("Writing vocabulary to " + vocabularyFilename);
+ Vocabulary.write(vocabularyFilename);
+ }
+
+ /**
+ * Integer-labeled, doubly-linked trie with some provisions for packing.
+ *
+ * @author Juri Ganitkevitch
+ *
+ * @param <D> The trie's value type.
+ */
+ class PackingTrie<D extends PackingTrieValue> {
+ int symbol;
+ PackingTrie<D> parent;
+
+ TreeMap<Integer, PackingTrie<D>> children;
+ List<D> values;
+
+ int address;
+
+ PackingTrie() {
+ address = -1;
+
+ symbol = 0;
+ parent = null;
+
+ children = new TreeMap<Integer, PackingTrie<D>>();
+ values = new ArrayList<D>();
+ }
+
+ PackingTrie(PackingTrie<D> parent, int symbol) {
+ this();
+ this.parent = parent;
+ this.symbol = symbol;
+ }
+
+ void add(int[] path, D value) {
+ add(path, 0, value);
+ }
+
+ private void add(int[] path, int index, D value) {
+ if (index == path.length)
+ this.values.add(value);
+ else {
+ PackingTrie<D> child = children.get(path[index]);
+ if (child == null) {
+ child = new PackingTrie<D>(this, path[index]);
+ children.put(path[index], child);
+ }
+ child.add(path, index + 1, value);
+ }
+ }
+
+ /**
+ * Calculate the size (in ints) of a packed trie node. Distinguishes downwards pointing (parent
+ * points to children) from upwards pointing (children point to parent) tries, as well as
+ * skeletal (no data, just the labeled links) and non-skeletal (nodes have a data block)
+ * packing.
+ *
+ * @param downwards Are we packing into a downwards-pointing trie?
+ * @param skeletal Are we packing into a skeletal trie?
+ *
+ * @return Number of bytes the trie node would occupy.
+ */
+ int size(boolean downwards, boolean skeletal) {
+ int size = 0;
+ if (downwards) {
+ // Number of children and links to children.
+ size = 1 + 2 * children.size();
+ } else {
+ // Link to parent.
+ size += 2;
+ }
+ // Non-skeletal packing: number of data items.
+ if (!skeletal)
+ size += 1;
+ // Non-skeletal packing: write size taken up by data items.
+ if (!skeletal && !values.isEmpty())
+ size += values.size() * values.get(0).size();
+
+ return size;
+ }
+
+ void clear() {
+ children.clear();
+ values.clear();
+ }
+ }
+
+ interface PackingTrieValue {
+ int size();
+ }
+
+ class SourceValue implements PackingTrieValue {
+ int lhs;
+ int data;
+ int target;
+
+ public SourceValue() {
+ }
+
+ SourceValue(int lhs, int data) {
+ this.lhs = lhs;
+ this.data = data;
+ }
+
+ void setTarget(int target) {
+ this.target = target;
+ }
+
+ public int size() {
+ return 3;
+ }
+ }
+
+ class TargetValue implements PackingTrieValue {
+ SourceValue parent;
+
+ TargetValue(SourceValue parent) {
+ this.parent = parent;
+ }
+
+ public int size() {
+ return 0;
+ }
+ }
+
+ abstract class PackingBuffer<T> {
+ private byte[] backing;
+ protected ByteBuffer buffer;
+
+ protected ArrayList<Integer> memoryLookup;
+ protected int totalSize;
+ protected ArrayList<Integer> onDiskOrder;
+
+ PackingBuffer() throws IOException {
+ allocate();
+ memoryLookup = new ArrayList<Integer>();
+ onDiskOrder = new ArrayList<Integer>();
+ totalSize = 0;
+ }
+
+ abstract int add(T item);
+
+ // Allocate a reasonably-sized buffer for the feature data.
+ private void allocate() {
+ backing = new byte[approximateMaximumSliceSize * DATA_SIZE_ESTIMATE];
+ buffer = ByteBuffer.wrap(backing);
+ }
+
+ // Reallocate the backing array and buffer, copies data over.
+ protected void reallocate() {
+ if (backing.length == Integer.MAX_VALUE)
+ return;
+ long attempted_length = backing.length * 2l;
+ int new_length;
+ // Detect overflow.
+ if (attempted_length >= Integer.MAX_VALUE)
+ new_length = Integer.MAX_VALUE;
+ else
+ new_length = (int) attempted_length;
+ byte[] new_backing = new byte[new_length];
+ System.arraycopy(backing, 0, new_backing, 0, backing.length);
+ int old_position = buffer.position();
+ ByteBuffer new_buffer = ByteBuffer.wrap(new_backing);
+ new_buffer.position(old_position);
+ buffer = new_buffer;
+ backing = new_backing;
+ }
+
+ /**
+ * Prepare the data buffer for disk writing.
+ */
+ void initialize() {
+ onDiskOrder.clear();
+ }
+
+ /**
+ * Enqueue a data block for later writing.
+ *
+ * @param block_index The index of the data block to add to writing queue.
+ * @return The to-be-written block's output index.
+ */
+ int write(int block_index) {
+ onDiskOrder.add(block_index);
+ return onDiskOrder.size() - 1;
+ }
+
+ /**
+ * Performs the actual writing to disk in the order specified by calls to write() since the last
+ * call to initialize().
+ *
+ * @param out
+ * @throws IOException
+ */
+ void flush(DataOutputStream out) throws IOException {
+ writeHeader(out);
+ int size;
+ int block_address;
+ for (int block_index : onDiskOrder) {
+ block_address = memoryLookup.get(block_index);
+ size = blockSize(block_index);
+ out.write(backing, block_address, size);
+ }
+ }
+
+ void clear() {
+ buffer.clear();
+ memoryLookup.clear();
+ onDiskOrder.clear();
+ }
+
+ boolean overflowing() {
+ return (buffer.position() >= DATA_SIZE_LIMIT);
+ }
+
+ private void writeHeader(DataOutputStream out) throws IOException {
+ if (out.size() == 0) {
+ out.writeInt(onDiskOrder.size());
+ out.writeInt(totalSize);
+ int disk_position = headerSize();
+ for (int block_index : onDiskOrder) {
+ out.writeInt(disk_position);
+ disk_position += blockSize(block_index);
+ }
+ } else {
+ throw new RuntimeException("Got a used stream for header writing.");
+ }
+ }
+
+ private int headerSize() {
+ // One integer for each data block, plus number of blocks and total size.
+ return 4 * (onDiskOrder.size() + 2);
+ }
+
+ private int blockSize(int block_index) {
+ int block_address = memoryLookup.get(block_index);
+ return (block_index < memoryLookup.size() - 1 ? memoryLookup.get(block_index + 1) : totalSize)
+ - block_address;
+ }
+ }
+
+ class FeatureBuffer extends PackingBuffer<TreeMap<Integer, Float>> {
+
+ private IntEncoder idEncoder;
+
+ FeatureBuffer() throws IOException {
+ super();
+ idEncoder = types.getIdEncoder();
+ logger.info("Encoding feature ids in: " + idEncoder.getKey());
+ }
+
+ /**
+ * Add a block of features to the buffer.
+ *
+ * @param features TreeMap with the features for one rule.
+ * @return The index of the resulting data block.
+ */
+ int add(TreeMap<Integer, Float> features) {
+ int data_position = buffer.position();
+
+ // Over-estimate how much room this addition will need: for each
+ // feature (ID_SIZE for label, "upper bound" of 4 for the value), plus ID_SIZE for
+ // the number of features. If this won't fit, reallocate the buffer.
+ int size_estimate = (4 + EncoderConfiguration.ID_SIZE) * features.size()
+ + EncoderConfiguration.ID_SIZE;
+ if (buffer.capacity() - buffer.position() <= size_estimate)
+ reallocate();
+
+ // Write features to buffer.
+ idEncoder.write(buffer, features.size());
+ for (Integer k : features.descendingKeySet()) {
+ float v = features.get(k);
+ // Sparse features.
+ if (v != 0.0) {
+ idEncoder.write(buffer, k);
+ encoderConfig.encoder(k).write(buffer, v);
+ }
+ }
+ // Store position the block was written to.
+ memoryLookup.add(data_position);
+ // Update total size (in bytes).
+ totalSize = buffer.position();
+
+ // Return block index.
+ return memoryLookup.size() - 1;
+ }
+ }
+
+ class AlignmentBuffer extends PackingBuffer<byte[]> {
+
+ AlignmentBuffer() throws IOException {
+ super();
+ }
+
+ /**
+ * Add a rule alignments to the buffer.
+ *
+ * @param alignments a byte array with the alignment points for one rule.
+ * @return The index of the resulting data block.
+ */
+ int add(byte[] alignments) {
+ int data_position = buffer.position();
+ int size_estimate = alignments.length + 1;
+ if (buffer.capacity() - buffer.position() <= size_estimate)
+ reallocate();
+
+ // Write alignment points to buffer.
+ buffer.put((byte) (alignments.length / 2));
+ buffer.put(alignments);
+
+ // Store position the block was written to.
+ memoryLookup.add(data_position);
+ // Update total size (in bytes).
+ totalSize = buffer.position();
+ // Return block index.
+ return memoryLookup.size() - 1;
+ }
+ }
+
+ class PackingFileTuple implements Comparable<PackingFileTuple> {
+ private File sourceFile;
+ private File targetLookupFile;
+ private File targetFile;
+
+ private File featureFile;
+ private File alignmentFile;
+
+ PackingFileTuple(String prefix) {
+ sourceFile = new File(output + File.separator + prefix + ".source");
+ targetFile = new File(output + File.separator + prefix + ".target");
+ targetLookupFile = new File(output + File.separator + prefix + ".target.lookup");
+ featureFile = new File(output + File.separator + prefix + ".features");
+
+ alignmentFile = null;
+ if (packAlignments)
+ alignmentFile = new File(output + File.separator + prefix + ".alignments");
+
+ logger.info("Allocated slice: " + sourceFile.getAbsolutePath());
+ }
+
+ DataOutputStream getSourceOutput() throws IOException {
+ return getOutput(sourceFile);
+ }
+
+ DataOutputStream getTargetOutput() throws IOException {
+ return getOutput(targetFile);
+ }
+
+ DataOutputStream getTargetLookupOutput() throws IOException {
+ return getOutput(targetLookupFile);
+ }
+
+ DataOutputStream getFeatureOutput() throws IOException {
+ return getOutput(featureFile);
+ }
+
+ DataOutputStream getAlignmentOutput() throws IOException {
+ if (alignmentFile != null)
+ return getOutput(alignmentFile);
+ return null;
+ }
+
+ private DataOutputStream getOutput(File file) throws IOException {
+ if (file.createNewFile()) {
+ return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
+ } else {
+ throw new RuntimeException("File doesn't exist: " + file.getName());
+ }
+ }
+
+ long getSize() {
+ return sourceFile.length() + targetFile.length() + featureFile.length();
+ }
+
+ @Override
+ public int compareTo(PackingFileTuple o) {
+ if (getSize() > o.getSize()) {
+ return -1;
+ } else if (getSize() < o.getSize()) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ }
+}