You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/23 22:17:45 UTC
[28/50] [abbrv] incubator-joshua git commit: Merge branch 'master'
into 7
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
index 6cfea6c,0000000..e9f9c62
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
+++ b/joshua-core/src/main/java/org/apache/joshua/server/ServerThread.java
@@@ -1,297 -1,0 +1,300 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.server;
+
+import static org.apache.joshua.decoder.ff.FeatureMap.hashFeature;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.StringReader;
+import java.io.UnsupportedEncodingException;
+import java.net.Socket;
+import java.net.SocketException;
+import java.net.URLDecoder;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.Translations;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.Trie;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.io.JSONMessage;
+import org.apache.joshua.decoder.io.TranslationRequestStream;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.sun.net.httpserver.HttpExchange;
+import com.sun.net.httpserver.HttpHandler;
+
+/**
+ * This class handles a concurrent request for translations from a newly opened socket, for
+ * both raw TCP/IP connections and for HTTP connections.
+ *
+ */
+public class ServerThread extends Thread implements HttpHandler {
+
+ private static final Logger LOG = LoggerFactory.getLogger(ServerThread.class);
+ private static final Charset FILE_ENCODING = Charset.forName("UTF-8");
+
+ private final JoshuaConfiguration joshuaConfiguration;
+ private Socket socket = null;
+ private final Decoder decoder;
+
+ /**
+ * Creates a new TcpServerThread that can run a set of translations.
+ *
+ * @param socket the socket representing the input/output streams
+ * @param decoder the configured decoder that handles performing translations
+ * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public ServerThread(Socket socket, Decoder decoder, JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ this.socket = socket;
+ this.decoder = decoder;
+ }
+
+ /**
+ * Reads the input from the socket, submits the input to the decoder, transforms the resulting
+ * translations into the required output format, writes out the formatted output, then closes the
+ * socket.
+ */
+ @Override
+ public void run() {
+
+ //TODO: use try-with-resources block
+ try {
+ BufferedReader reader = new BufferedReader(new InputStreamReader(socket.getInputStream(), FILE_ENCODING));
+
+ TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+ try {
+ Translations translations = decoder.decodeAll(request);
+
+ OutputStream out = socket.getOutputStream();
+
+ for (Translation translation: translations) {
+ out.write(translation.toString().getBytes());
+ }
+
+ } catch (SocketException e) {
+ LOG.error(" Socket interrupted", e);
+ request.shutdown();
+ } finally {
+ reader.close();
+ socket.close();
+ }
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ }
+ }
+
+ public HashMap<String, String> queryToMap(String query) throws UnsupportedEncodingException {
+ HashMap<String, String> result = new HashMap<String, String>();
+ for (String param : query.split("&")) {
+ String pair[] = param.split("=");
+ if (pair.length > 1) {
+ result.put(pair[0], URLDecoder.decode(pair[1], "UTF-8"));
+ } else {
+ result.put(pair[0], "");
+ }
+ }
+ return result;
+ }
+
+ private class HttpWriter extends OutputStream {
+
+ private HttpExchange client = null;
+ private OutputStream out = null;
+
+ public HttpWriter(HttpExchange client) {
+ this.client = client;
+ client.getResponseHeaders().add("Access-Control-Allow-Origin", "*");
+ }
+
+ @Override
+ public void write(byte[] response) throws IOException {
+ client.sendResponseHeaders(200, response.length);
+ out = client.getResponseBody();
+ out.write(response);
+ out.close();
+ }
+
+ @Override
+ public void write(int b) throws IOException {
+ out.write(b);
+ }
+ }
+
+ /**
+ * Called to handle an HTTP connection. This looks for metadata in the URL string, which is processed
+ * if present. It also then handles returning a JSON-formatted object to the caller.
+ *
+ * @param client the client connection
+ */
+ @Override
+ public synchronized void handle(HttpExchange client) throws IOException {
+
+ HashMap<String, String> params = queryToMap(client.getRequestURI().getQuery());
+ String query = params.get("q");
+ String meta = params.get("meta");
+
+ BufferedReader reader = new BufferedReader(new StringReader(query));
+ TranslationRequestStream request = new TranslationRequestStream(reader, joshuaConfiguration);
+
+ Translations translations = decoder.decodeAll(request);
+ JSONMessage message = new JSONMessage();
+ if (meta != null && ! meta.isEmpty())
+ handleMetadata(meta, message);
+
+ for (Translation translation: translations) {
+ LOG.info("TRANSLATION: '{}' with {} k-best items", translation, translation.getStructuredTranslations().size());
+ message.addTranslation(translation);
+ }
+
+ OutputStream out = new HttpWriter(client);
+ out.write(message.toString().getBytes());
+ if (LOG.isDebugEnabled())
+ LOG.debug(message.toString());
+ out.close();
+
+ reader.close();
+ }
+
+ /**
+ * Processes metadata commands received in the HTTP request. Some commands result in sending data back.
+ *
+ * @param meta the metadata request
+ * @return result string (for some commands)
+ */
+ private void handleMetadata(String meta, JSONMessage message) {
+ String[] tokens = meta.split("\\s+", 2);
+ String type = tokens[0];
+ String args = tokens.length > 1 ? tokens[1] : "";
+
+ if (type.equals("get_weight")) {
+ String weight = tokens[1];
+ LOG.info("WEIGHT: %s = %.3f", weight, Decoder.weights.getOrDefault(hashFeature(weight)));
+
+ } else if (type.equals("set_weights")) {
+ // Change a decoder weight
+ String[] argTokens = args.split("\\s+");
+ for (int i = 0; i < argTokens.length; i += 2) {
+ String feature = argTokens[i];
+ int featureId = hashFeature(feature);
+ String newValue = argTokens[i+1];
+ float old_weight = Decoder.weights.getOrDefault(featureId);
+ Decoder.weights.put(featureId, Float.parseFloat(newValue));
+ LOG.info("set_weights: {} {} -> {}", feature, old_weight, Decoder.weights.getOrDefault(featureId));
+ }
+
+ message.addMetaData("weights " + Decoder.weights.toString());
+
+ } else if (type.equals("get_weights")) {
+ message.addMetaData("weights " + Decoder.weights.toString());
+
+ } else if (type.equals("add_rule")) {
+ String argTokens[] = args.split(" \\|\\|\\| ");
+
+ if (argTokens.length < 3) {
+ LOG.error("* INVALID RULE '{}'", meta);
+ return;
+ }
+
+ String lhs = argTokens[0];
+ String source = argTokens[1];
+ String target = argTokens[2];
+ String featureStr = "";
++ String alignmentStr = "";
+ if (argTokens.length > 3)
+ featureStr = argTokens[3];
-
++ if (argTokens.length > 4)
++ alignmentStr = " ||| " + argTokens[4];
++
+ /* Prepend source and target side nonterminals for phrase-based decoding. Probably better
+ * handled in each grammar type's addRule() function.
+ */
+ String ruleString = (joshuaConfiguration.search_algorithm.equals("stack"))
- ? String.format("%s ||| [X,1] %s ||| [X,1] %s ||| custom=1 %s", lhs, source, target, featureStr)
- : String.format("%s ||| %s ||| %s ||| custom=1 %s", lhs, source, target, featureStr);
++ ? String.format("%s ||| [X,1] %s ||| [X,1] %s ||| -1 %s %s", lhs, source, target, featureStr, alignmentStr)
++ : String.format("%s ||| %s ||| %s ||| -1 %s %s", lhs, source, target, featureStr, alignmentStr);
+
+ Rule rule = new HieroFormatReader(decoder.getCustomPhraseTable().getOwner()).parseLine(ruleString);
+ decoder.addCustomRule(rule);
+
+ LOG.info("Added custom rule {}", rule.toString());
+
+ } else if (type.equals("list_rules")) {
+
+ LOG.info("list_rules");
+
+ // Walk the the grammar trie
+ ArrayList<Trie> nodes = new ArrayList<Trie>();
+ nodes.add(decoder.getCustomPhraseTable().getTrieRoot());
+
+ while (nodes.size() > 0) {
+ Trie trie = nodes.remove(0);
+
+ if (trie == null)
+ continue;
+
+ if (trie.hasRules()) {
+ for (Rule rule: trie.getRuleCollection().getRules()) {
+ message.addRule(rule.toString());
+ LOG.debug("Found rule: " + rule);
+ }
+ }
+
+ if (trie.getExtensions() != null)
+ nodes.addAll(trie.getExtensions());
+ }
+
+ } else if (type.equals("remove_rule")) {
+
+ Rule rule = new HieroFormatReader(decoder.getCustomPhraseTable().getOwner()).parseLine(args);
+
+ LOG.info("remove_rule " + rule);
+
+ Trie trie = decoder.getCustomPhraseTable().getTrieRoot();
+ int[] sourceTokens = rule.getSource();
+ for (int i = 0; i < sourceTokens.length; i++) {
+ Trie nextTrie = trie.match(sourceTokens[i]);
+ if (nextTrie == null)
+ return;
+
+ trie = nextTrie;
+ }
+
+ if (trie.hasRules()) {
+ for (Rule ruleCand: trie.getRuleCollection().getRules()) {
+ if (Arrays.equals(rule.getTarget(), ruleCand.getTarget())) {
+ trie.getRuleCollection().getRules().remove(ruleCand);
+ break;
+ }
+ }
+ return;
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --cc joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
index 6c02d19,0000000..5861052
mode 100644,000000..100644
--- a/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
+++ b/joshua-core/src/main/java/org/apache/joshua/tools/GrammarPacker.java
@@@ -1,932 -1,0 +1,936 @@@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.tools;
+
+import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER_ID;
+import static org.apache.joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
+
+import java.io.BufferedOutputStream;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Queue;
+import java.util.TreeMap;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.RuleFactory;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.format.MosesFormatReader;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.encoding.EncoderConfiguration;
+import org.apache.joshua.util.encoding.FeatureTypeAnalyzer;
+import org.apache.joshua.util.encoding.IntEncoder;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+public class GrammarPacker {
+
+ private static final Logger LOG = LoggerFactory.getLogger(GrammarPacker.class);
+
+ /**
+ * The packed grammar version number. Increment this any time you add new features, and update
+ * the documentation.
+ *
+ * Version history:
+ *
+ * - 3 (May 2016). This was the first version that was marked. It removed the special phrase-
+ * table packing that packed phrases without the [X,1] on the source and target sides, which
+ * then required special handling in the decoder to use for phrase-based decoding.
+ *
- *
++ * - 4 (August 2016). Phrase-based decoding rewritten to represent phrases without a builtin
++ * nonterminal. Instead, cost-less glue rules are used in phrase-based decoding. This eliminates
++ * the need for special handling of phrase grammars (except for having to add a LHS), and lets
++ * phrase grammars be used in both hierarchical and phrase-based decoding without conversion.
++ *
+ */
- public static final int VERSION = 3;
++ public static final int VERSION = 4;
+
+ // Size limit for slice in bytes.
+ private static int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
+ // Estimated average number of feature entries for one rule.
+ private static int DATA_SIZE_ESTIMATE = 20;
+
+ private static final String SOURCE_WORDS_SEPARATOR = " ||| ";
+
+ // Output directory name.
+ private String output;
+
+ // Input grammar to be packed.
+ private String grammar;
+
+ public String getGrammar() {
+ return grammar;
+ }
+
+ public String getOutputDirectory() {
+ return output;
+ }
+
+ // Approximate maximum size of a slice in number of rules
+ private int approximateMaximumSliceSize;
+
+ private boolean labeled;
+
+ private boolean packAlignments;
+ private boolean grammarAlignments;
+ private String alignments;
+
+ private FeatureTypeAnalyzer types;
+ private EncoderConfiguration encoderConfig;
+
+ private String dump;
+
+ private int max_source_len;
+
+ public GrammarPacker(String grammar_filename, String config_filename, String output_filename,
+ String alignments_filename, String featuredump_filename, boolean grammar_alignments,
+ int approximateMaximumSliceSize)
+ throws IOException {
+ this.labeled = true;
+ this.grammar = grammar_filename;
+ this.output = output_filename;
+ this.dump = featuredump_filename;
+ this.grammarAlignments = grammar_alignments;
+ this.approximateMaximumSliceSize = approximateMaximumSliceSize;
+ this.max_source_len = 0;
+
+ // TODO: Always open encoder config? This is debatable.
+ this.types = new FeatureTypeAnalyzer(true);
+
+ this.alignments = alignments_filename;
+ packAlignments = grammarAlignments || (alignments != null);
+ if (!packAlignments) {
+ LOG.info("No alignments file or grammar specified, skipping.");
+ } else if (alignments != null && !new File(alignments_filename).exists()) {
+ throw new RuntimeException("Alignments file does not exist: " + alignments);
+ }
+
+ if (config_filename != null) {
+ readConfig(config_filename);
+ types.readConfig(config_filename);
+ } else {
+ LOG.info("No config specified. Attempting auto-detection of feature types.");
+ }
+ LOG.info("Approximate maximum slice size (in # of rules) set to {}", approximateMaximumSliceSize);
+
+ File working_dir = new File(output);
+ working_dir.mkdir();
+ if (!working_dir.exists()) {
+ throw new RuntimeException("Failed creating output directory.");
+ }
+ }
+
+ private void readConfig(String config_filename) throws IOException {
+ LineReader reader = new LineReader(config_filename);
+ while (reader.hasNext()) {
+ // Clean up line, chop comments off and skip if the result is empty.
+ String line = reader.next().trim();
+ if (line.indexOf('#') != -1)
+ line = line.substring(0, line.indexOf('#'));
+ if (line.isEmpty())
+ continue;
+ String[] fields = line.split("[\\s]+");
+
+ if (fields.length < 2) {
+ throw new RuntimeException("Incomplete line in config.");
+ }
+ if ("slice_size".equals(fields[0])) {
+ // Number of records to concurrently load into memory for sorting.
+ approximateMaximumSliceSize = Integer.parseInt(fields[1]);
+ }
+ }
+ reader.close();
+ }
+
+ /**
+ * Executes the packing.
+ *
+ * @throws IOException if there is an error reading the grammar
+ */
+ public void pack() throws IOException {
+ LOG.info("Beginning exploration pass.");
+
+ // Explore pass. Learn vocabulary and feature value histograms.
+ LOG.info("Exploring: {}", grammar);
+
+ HieroFormatReader grammarReader = getGrammarReader();
+ explore(grammarReader);
+
+ LOG.info("Exploration pass complete. Freezing vocabulary and finalizing encoders.");
+ if (dump != null) {
+ PrintWriter dump_writer = new PrintWriter(dump);
+ dump_writer.println(types.toString());
+ dump_writer.close();
+ }
+
+ types.inferTypes(this.labeled);
+ LOG.info("Type inference complete.");
+
+ LOG.info("Finalizing encoding.");
+
+ LOG.info("Writing encoding.");
+ types.write(output + File.separator + "encoding");
+
+ writeVocabulary();
+
+ String configFile = output + File.separator + "config";
+ LOG.info("Writing config to '{}'", configFile);
+ // Write config options
+ FileWriter config = new FileWriter(configFile);
+ config.write(String.format("version = %d\n", VERSION));
+ config.write(String.format("max-source-len = %d\n", max_source_len));
+ config.close();
+
+ // Read previously written encoder configuration to match up to changed
+ // vocabulary id's.
+ LOG.info("Reading encoding.");
+ encoderConfig = new EncoderConfiguration();
+ encoderConfig.load(output + File.separator + "encoding");
+
+ LOG.info("Beginning packing pass.");
+ // Actual binarization pass. Slice and pack source, target and data.
+ grammarReader = getGrammarReader();
+ LineReader alignment_reader = null;
+ if (packAlignments && !grammarAlignments)
+ alignment_reader = new LineReader(alignments);
+ binarize(grammarReader, alignment_reader);
+ LOG.info("Packing complete.");
+
+ LOG.info("Packed grammar in: {}", output);
+ LOG.info("Done.");
+ }
+
+ /**
+ * Returns a reader that turns whatever file format is found into unowned Hiero grammar rules.
+ * This means, features are NOT prepended with an owner string at packing time.
+ *
+ * @param grammarFile
+ * @return GrammarReader of correct Format
+ * @throws IOException
+ */
+ private HieroFormatReader getGrammarReader() throws IOException {
+ LineReader reader = new LineReader(grammar);
+ String line = reader.next();
+ if (line.startsWith("[")) {
+ return new HieroFormatReader(grammar, UNKNOWN_OWNER_ID);
+ } else {
+ return new MosesFormatReader(grammar, UNKNOWN_OWNER_ID);
+ }
+ }
+
+ /**
+ * This first pass over the grammar
+ * @param reader
+ */
+ private void explore(HieroFormatReader reader) {
+
+ // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
+ // appear in the same order. They are assigned numeric names in order of appearance.
+ this.types.setLabeled(true);
+
+ for (Rule rule : reader) {
+
+ max_source_len = Math.max(max_source_len, rule.getSource().length);
+
+ /* Add symbols to vocabulary.
+ * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
+ * and "[X,1]" to the vocabulary.
+ *
+ * TODO: MJP May 2016: Is it necessary to add [X,1]? This is currently being done in
+ * {@link HieroFormatReader}, which is called by {@link MosesFormatReader}.
+ */
+
+ // pass the value through the appropriate encoder.
+ for (final Entry<Integer, Float> entry : rule.getFeatureVector().entrySet()) {
+ types.observe(entry.getKey(), entry.getValue());
+ }
+ }
+ }
+
+ /**
+ * Returns a String encoding the first two source words.
+ * If there is only one source word, use empty string for the second.
+ */
+ private String getFirstTwoSourceWords(final String[] source_words) {
+ return source_words[0] + SOURCE_WORDS_SEPARATOR + ((source_words.length > 1) ? source_words[1] : "");
+ }
+
+ private void binarize(HieroFormatReader grammarReader, LineReader alignment_reader) throws IOException {
+ int counter = 0;
+ int slice_counter = 0;
+ int num_slices = 0;
+
+ boolean ready_to_flush = false;
+ // to determine when flushing is possible
+ String prev_first_two_source_words = null;
+
+ PackingTrie<SourceValue> source_trie = new PackingTrie<SourceValue>();
+ PackingTrie<TargetValue> target_trie = new PackingTrie<TargetValue>();
+ FeatureBuffer feature_buffer = new FeatureBuffer();
+
+ AlignmentBuffer alignment_buffer = null;
+ if (packAlignments)
+ alignment_buffer = new AlignmentBuffer();
+
+ TreeMap<Integer, Float> features = new TreeMap<Integer, Float>();
+ for (Rule rule : grammarReader) {
+ counter++;
+ slice_counter++;
+
+ String lhs_word = Vocabulary.word(rule.getLHS());
+ String[] source_words = rule.getSourceWords().split("\\s+");
+ String[] target_words = rule.getTargetWords().split("\\s+");
+
+ // Reached slice limit size, indicate that we're closing up.
+ if (!ready_to_flush
+ && (slice_counter > approximateMaximumSliceSize
+ || feature_buffer.overflowing()
+ || (packAlignments && alignment_buffer.overflowing()))) {
+ ready_to_flush = true;
+ // store the first two source words when slice size limit was reached
+ prev_first_two_source_words = getFirstTwoSourceWords(source_words);
+ }
+ // ready to flush
+ if (ready_to_flush) {
+ final String first_two_source_words = getFirstTwoSourceWords(source_words);
+ // the grammar can only be partitioned at the level of first two source word changes.
+ // Thus, we can only flush if the current first two source words differ from the ones
+ // when the slice size limit was reached.
+ if (!first_two_source_words.equals(prev_first_two_source_words)) {
+ LOG.warn("ready to flush and first two words have changed ({} vs. {})",
+ prev_first_two_source_words, first_two_source_words);
+ LOG.info("flushing {} rules to slice.", slice_counter);
+ flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+ source_trie.clear();
+ target_trie.clear();
+ feature_buffer.clear();
+ if (packAlignments)
+ alignment_buffer.clear();
+
+ num_slices++;
+ slice_counter = 0;
+ ready_to_flush = false;
+ }
+ }
+
+ int alignment_index = -1;
+ // If present, process alignments.
+ if (packAlignments) {
+ byte[] alignments = null;
+ if (grammarAlignments) {
+ alignments = rule.getAlignment();
+ } else {
+ if (!alignment_reader.hasNext()) {
+ LOG.error("No more alignments starting in line {}", counter);
+ throw new RuntimeException("No more alignments starting in line " + counter);
+ }
+ alignments = RuleFactory.parseAlignmentString(alignment_reader.next().trim());
+ }
+ alignment_index = alignment_buffer.add(alignments);
+ }
+
+ // Process features.
+ // Implicitly sort via TreeMap, write to data buffer, remember position
+ // to pass on to the source trie node.
+ features.clear();
+ for (Entry<Integer, Float> entry : rule.getFeatureVector().entrySet()) {
+ int featureId = entry.getKey();
+ float featureValue = entry.getValue();
+ if (featureValue != 0f) {
+ features.put(encoderConfig.innerId(featureId), featureValue);
+ }
+ }
+
+ int features_index = feature_buffer.add(features);
+
+ // Sanity check on the data block index.
+ if (packAlignments && features_index != alignment_index) {
+ LOG.error("Block index mismatch between features ({}) and alignments ({}).",
+ features_index, alignment_index);
+ throw new RuntimeException("Data block index mismatch.");
+ }
+
+ // Process source side.
+ SourceValue sv = new SourceValue(Vocabulary.id(lhs_word), features_index);
+ int[] source = new int[source_words.length];
+ for (int i = 0; i < source_words.length; i++) {
+ if (FormatUtils.isNonterminal(source_words[i]))
+ source[i] = Vocabulary.id(FormatUtils.stripNonTerminalIndex(source_words[i]));
+ else
+ source[i] = Vocabulary.id(source_words[i]);
+ }
+ source_trie.add(source, sv);
+
+ // Process target side.
+ TargetValue tv = new TargetValue(sv);
+ int[] target = new int[target_words.length];
+ for (int i = 0; i < target_words.length; i++) {
+ if (FormatUtils.isNonterminal(target_words[i])) {
+ target[target_words.length - (i + 1)] = -FormatUtils.getNonterminalIndex(target_words[i]);
+ } else {
+ target[target_words.length - (i + 1)] = Vocabulary.id(target_words[i]);
+ }
+ }
+ target_trie.add(target, tv);
+ }
+ // flush last slice and clear buffers
+ flush(source_trie, target_trie, feature_buffer, alignment_buffer, num_slices);
+ }
+
+ /**
+ * Serializes the source, target and feature data structures into interlinked binary files. Target
+ * is written first, into a skeletal (node don't carry any data) upward-pointing trie, updating
+ * the linking source trie nodes with the position once it is known. Source and feature data are
+ * written simultaneously. The source structure is written into a downward-pointing trie and
+ * stores the rule's lhs as well as links to the target and feature stream. The feature stream is
+ * prompted to write out a block
+ *
+ * @param source_trie
+ * @param target_trie
+ * @param feature_buffer
+ * @param id
+ * @throws IOException
+ */
+ private void flush(PackingTrie<SourceValue> source_trie,
+ PackingTrie<TargetValue> target_trie, FeatureBuffer feature_buffer,
+ AlignmentBuffer alignment_buffer, int id) throws IOException {
+ // Make a slice object for this piece of the grammar.
+ PackingFileTuple slice = new PackingFileTuple("slice_" + String.format("%05d", id));
+ // Pull out the streams for source, target and data output.
+ DataOutputStream source_stream = slice.getSourceOutput();
+ DataOutputStream target_stream = slice.getTargetOutput();
+ DataOutputStream target_lookup_stream = slice.getTargetLookupOutput();
+ DataOutputStream feature_stream = slice.getFeatureOutput();
+ DataOutputStream alignment_stream = slice.getAlignmentOutput();
+
+ Queue<PackingTrie<TargetValue>> target_queue;
+ Queue<PackingTrie<SourceValue>> source_queue;
+
+ // The number of bytes both written into the source stream and
+ // buffered in the source queue.
+ int source_position;
+ // The number of bytes written into the target stream.
+ int target_position;
+
+ // Add trie root into queue, set target position to 0 and set cumulated
+ // size to size of trie root.
+ target_queue = new LinkedList<PackingTrie<TargetValue>>();
+ target_queue.add(target_trie);
+ target_position = 0;
+
+ // Target lookup table for trie levels.
+ int current_level_size = 1;
+ int next_level_size = 0;
+ ArrayList<Integer> target_lookup = new ArrayList<Integer>();
+
+ // Packing loop for upwards-pointing target trie.
+ while (!target_queue.isEmpty()) {
+ // Pop top of queue.
+ PackingTrie<TargetValue> node = target_queue.poll();
+ // Register that this is where we're writing the node to.
+ node.address = target_position;
+ // Tell source nodes that we're writing to this position in the file.
+ for (TargetValue tv : node.values)
+ tv.parent.target = node.address;
+ // Write link to parent.
+ if (node.parent != null)
+ target_stream.writeInt(node.parent.address);
+ else
+ target_stream.writeInt(-1);
+ target_stream.writeInt(node.symbol);
+ // Enqueue children.
+ for (int k : node.children.descendingKeySet()) {
+ PackingTrie<TargetValue> child = node.children.get(k);
+ target_queue.add(child);
+ }
+ target_position += node.size(false, true);
+ next_level_size += node.children.descendingKeySet().size();
+
+ current_level_size--;
+ if (current_level_size == 0) {
+ target_lookup.add(target_position);
+ current_level_size = next_level_size;
+ next_level_size = 0;
+ }
+ }
+ target_lookup_stream.writeInt(target_lookup.size());
+ for (int i : target_lookup)
+ target_lookup_stream.writeInt(i);
+ target_lookup_stream.close();
+
+ // Setting up for source and data writing.
+ source_queue = new LinkedList<PackingTrie<SourceValue>>();
+ source_queue.add(source_trie);
+ source_position = source_trie.size(true, false);
+ source_trie.address = target_position;
+
+ // Ready data buffers for writing.
+ feature_buffer.initialize();
+ if (packAlignments)
+ alignment_buffer.initialize();
+
+ // Packing loop for downwards-pointing source trie.
+ while (!source_queue.isEmpty()) {
+ // Pop top of queue.
+ PackingTrie<SourceValue> node = source_queue.poll();
+ // Write number of children.
+ source_stream.writeInt(node.children.size());
+ // Write links to children.
+ for (int k : node.children.descendingKeySet()) {
+ PackingTrie<SourceValue> child = node.children.get(k);
+ // Enqueue child.
+ source_queue.add(child);
+ // Child's address will be at the current end of the queue.
+ child.address = source_position;
+ // Advance cumulated size by child's size.
+ source_position += child.size(true, false);
+ // Write the link.
+ source_stream.writeInt(k);
+ source_stream.writeInt(child.address);
+ }
+ // Write number of data items.
+ source_stream.writeInt(node.values.size());
+ // Write lhs and links to target and data.
+ for (SourceValue sv : node.values) {
+ int feature_block_index = feature_buffer.write(sv.data);
+ if (packAlignments) {
+ int alignment_block_index = alignment_buffer.write(sv.data);
+ if (alignment_block_index != feature_block_index) {
+ LOG.error("Block index mismatch.");
+ throw new RuntimeException("Block index mismatch: alignment (" + alignment_block_index
+ + ") and features (" + feature_block_index + ") don't match.");
+ }
+ }
+ source_stream.writeInt(sv.lhs);
+ source_stream.writeInt(sv.target);
+ source_stream.writeInt(feature_block_index);
+ }
+ }
+ // Flush the data stream.
+ feature_buffer.flush(feature_stream);
+ if (packAlignments)
+ alignment_buffer.flush(alignment_stream);
+
+ target_stream.close();
+ source_stream.close();
+ feature_stream.close();
+ if (packAlignments)
+ alignment_stream.close();
+ }
+
+ public void writeVocabulary() throws IOException {
+ final String vocabularyFilename = output + File.separator + VOCABULARY_FILENAME;
+ LOG.info("Writing vocabulary to {}", vocabularyFilename);
+ Vocabulary.write(vocabularyFilename);
+ }
+
+ /**
+ * Integer-labeled, doubly-linked trie with some provisions for packing.
+ *
+ * @author Juri Ganitkevitch
+ *
+ * @param <D> The trie's value type.
+ */
+ class PackingTrie<D extends PackingTrieValue> {
+ int symbol;
+ PackingTrie<D> parent;
+
+ TreeMap<Integer, PackingTrie<D>> children;
+ List<D> values;
+
+ int address;
+
+ PackingTrie() {
+ address = -1;
+
+ symbol = 0;
+ parent = null;
+
+ children = new TreeMap<Integer, PackingTrie<D>>();
+ values = new ArrayList<D>();
+ }
+
+ PackingTrie(PackingTrie<D> parent, int symbol) {
+ this();
+ this.parent = parent;
+ this.symbol = symbol;
+ }
+
+ void add(int[] path, D value) {
+ add(path, 0, value);
+ }
+
+ private void add(int[] path, int index, D value) {
+ if (index == path.length)
+ this.values.add(value);
+ else {
+ PackingTrie<D> child = children.get(path[index]);
+ if (child == null) {
+ child = new PackingTrie<D>(this, path[index]);
+ children.put(path[index], child);
+ }
+ child.add(path, index + 1, value);
+ }
+ }
+
+ /**
+ * Calculate the size (in ints) of a packed trie node. Distinguishes downwards pointing (parent
+ * points to children) from upwards pointing (children point to parent) tries, as well as
+ * skeletal (no data, just the labeled links) and non-skeletal (nodes have a data block)
+ * packing.
+ *
+ * @param downwards Are we packing into a downwards-pointing trie?
+ * @param skeletal Are we packing into a skeletal trie?
+ *
+ * @return Number of bytes the trie node would occupy.
+ */
+ int size(boolean downwards, boolean skeletal) {
+ int size = 0;
+ if (downwards) {
+ // Number of children and links to children.
+ size = 1 + 2 * children.size();
+ } else {
+ // Link to parent.
+ size += 2;
+ }
+ // Non-skeletal packing: number of data items.
+ if (!skeletal)
+ size += 1;
+ // Non-skeletal packing: write size taken up by data items.
+ if (!skeletal && !values.isEmpty())
+ size += values.size() * values.get(0).size();
+
+ return size;
+ }
+
+ void clear() {
+ children.clear();
+ values.clear();
+ }
+ }
+
+ interface PackingTrieValue {
+ int size();
+ }
+
+ class SourceValue implements PackingTrieValue {
+ int lhs;
+ int data;
+ int target;
+
+ public SourceValue() {
+ }
+
+ SourceValue(int lhs, int data) {
+ this.lhs = lhs;
+ this.data = data;
+ }
+
+ void setTarget(int target) {
+ this.target = target;
+ }
+
+ public int size() {
+ return 3;
+ }
+ }
+
+ class TargetValue implements PackingTrieValue {
+ SourceValue parent;
+
+ TargetValue(SourceValue parent) {
+ this.parent = parent;
+ }
+
+ public int size() {
+ return 0;
+ }
+ }
+
+ abstract class PackingBuffer<T> {
+ private byte[] backing;
+ protected ByteBuffer buffer;
+
+ protected ArrayList<Integer> memoryLookup;
+ protected int totalSize;
+ protected ArrayList<Integer> onDiskOrder;
+
+ PackingBuffer() throws IOException {
+ allocate();
+ memoryLookup = new ArrayList<Integer>();
+ onDiskOrder = new ArrayList<Integer>();
+ totalSize = 0;
+ }
+
+ abstract int add(T item);
+
+ // Allocate a reasonably-sized buffer for the feature data.
+ private void allocate() {
+ backing = new byte[approximateMaximumSliceSize * DATA_SIZE_ESTIMATE];
+ buffer = ByteBuffer.wrap(backing);
+ }
+
+ // Reallocate the backing array and buffer, copies data over.
+ protected void reallocate() {
+ if (backing.length == Integer.MAX_VALUE)
+ return;
+ long attempted_length = backing.length * 2l;
+ int new_length;
+ // Detect overflow.
+ if (attempted_length >= Integer.MAX_VALUE)
+ new_length = Integer.MAX_VALUE;
+ else
+ new_length = (int) attempted_length;
+ byte[] new_backing = new byte[new_length];
+ System.arraycopy(backing, 0, new_backing, 0, backing.length);
+ int old_position = buffer.position();
+ ByteBuffer new_buffer = ByteBuffer.wrap(new_backing);
+ new_buffer.position(old_position);
+ buffer = new_buffer;
+ backing = new_backing;
+ }
+
+ /**
+ * Prepare the data buffer for disk writing.
+ */
+ void initialize() {
+ onDiskOrder.clear();
+ }
+
+ /**
+ * Enqueue a data block for later writing.
+ *
+ * @param block_index The index of the data block to add to writing queue.
+ * @return The to-be-written block's output index.
+ */
+ int write(int block_index) {
+ onDiskOrder.add(block_index);
+ return onDiskOrder.size() - 1;
+ }
+
+ /**
+ * Performs the actual writing to disk in the order specified by calls to write() since the last
+ * call to initialize().
+ *
+ * @param out
+ * @throws IOException
+ */
+ void flush(DataOutputStream out) throws IOException {
+ writeHeader(out);
+ int size;
+ int block_address;
+ for (int block_index : onDiskOrder) {
+ block_address = memoryLookup.get(block_index);
+ size = blockSize(block_index);
+ out.write(backing, block_address, size);
+ }
+ }
+
+ void clear() {
+ buffer.clear();
+ memoryLookup.clear();
+ onDiskOrder.clear();
+ }
+
+ boolean overflowing() {
+ return (buffer.position() >= DATA_SIZE_LIMIT);
+ }
+
+ private void writeHeader(DataOutputStream out) throws IOException {
+ if (out.size() == 0) {
+ out.writeInt(onDiskOrder.size());
+ out.writeInt(totalSize);
+ int disk_position = headerSize();
+ for (int block_index : onDiskOrder) {
+ out.writeInt(disk_position);
+ disk_position += blockSize(block_index);
+ }
+ } else {
+ throw new RuntimeException("Got a used stream for header writing.");
+ }
+ }
+
+ private int headerSize() {
+ // One integer for each data block, plus number of blocks and total size.
+ return 4 * (onDiskOrder.size() + 2);
+ }
+
+ private int blockSize(int block_index) {
+ int block_address = memoryLookup.get(block_index);
+ return (block_index < memoryLookup.size() - 1 ? memoryLookup.get(block_index + 1) : totalSize)
+ - block_address;
+ }
+ }
+
+ class FeatureBuffer extends PackingBuffer<TreeMap<Integer, Float>> {
+
+ private IntEncoder idEncoder;
+
+ FeatureBuffer() throws IOException {
+ super();
+ idEncoder = types.getIdEncoder();
+ LOG.info("Encoding feature ids in: {}", idEncoder.getKey());
+ }
+
+ /**
+ * Add a block of features to the buffer.
+ *
+ * @param features TreeMap with the features for one rule.
+ * @return The index of the resulting data block.
+ */
+ int add(TreeMap<Integer, Float> features) {
+ int data_position = buffer.position();
+
+ // Over-estimate how much room this addition will need: for each
+ // feature (ID_SIZE for label, "upper bound" of 4 for the value), plus ID_SIZE for
+ // the number of features. If this won't fit, reallocate the buffer.
+ int size_estimate = (4 + EncoderConfiguration.ID_SIZE) * features.size()
+ + EncoderConfiguration.ID_SIZE;
+ if (buffer.capacity() - buffer.position() <= size_estimate)
+ reallocate();
+
+ // Write features to buffer.
+ idEncoder.write(buffer, features.size());
+ for (Integer k : features.descendingKeySet()) {
+ float v = features.get(k);
+ // Sparse features.
+ if (v != 0.0) {
+ idEncoder.write(buffer, k);
+ encoderConfig.encoder(k).write(buffer, v);
+ }
+ }
+ // Store position the block was written to.
+ memoryLookup.add(data_position);
+ // Update total size (in bytes).
+ totalSize = buffer.position();
+
+ // Return block index.
+ return memoryLookup.size() - 1;
+ }
+ }
+
+ class AlignmentBuffer extends PackingBuffer<byte[]> {
+
+ AlignmentBuffer() throws IOException {
+ super();
+ }
+
+ /**
+ * Add a rule alignments to the buffer.
+ *
+ * @param alignments a byte array with the alignment points for one rule.
+ * @return The index of the resulting data block.
+ */
+ int add(byte[] alignments) {
+ int data_position = buffer.position();
+ int size_estimate = alignments.length + 1;
+ if (buffer.capacity() - buffer.position() <= size_estimate)
+ reallocate();
+
+ // Write alignment points to buffer.
+ buffer.put((byte) (alignments.length / 2));
+ buffer.put(alignments);
+
+ // Store position the block was written to.
+ memoryLookup.add(data_position);
+ // Update total size (in bytes).
+ totalSize = buffer.position();
+ // Return block index.
+ return memoryLookup.size() - 1;
+ }
+ }
+
+ class PackingFileTuple implements Comparable<PackingFileTuple> {
+ private File sourceFile;
+ private File targetLookupFile;
+ private File targetFile;
+
+ private File featureFile;
+ private File alignmentFile;
+
+ PackingFileTuple(String prefix) {
+ sourceFile = new File(output + File.separator + prefix + ".source");
+ targetFile = new File(output + File.separator + prefix + ".target");
+ targetLookupFile = new File(output + File.separator + prefix + ".target.lookup");
+ featureFile = new File(output + File.separator + prefix + ".features");
+
+ alignmentFile = null;
+ if (packAlignments)
+ alignmentFile = new File(output + File.separator + prefix + ".alignments");
+
+ LOG.info("Allocated slice: {}", sourceFile.getAbsolutePath());
+ }
+
+ DataOutputStream getSourceOutput() throws IOException {
+ return getOutput(sourceFile);
+ }
+
+ DataOutputStream getTargetOutput() throws IOException {
+ return getOutput(targetFile);
+ }
+
+ DataOutputStream getTargetLookupOutput() throws IOException {
+ return getOutput(targetLookupFile);
+ }
+
+ DataOutputStream getFeatureOutput() throws IOException {
+ return getOutput(featureFile);
+ }
+
+ DataOutputStream getAlignmentOutput() throws IOException {
+ if (alignmentFile != null)
+ return getOutput(alignmentFile);
+ return null;
+ }
+
+ private DataOutputStream getOutput(File file) throws IOException {
+ if (file.createNewFile()) {
+ return new DataOutputStream(new BufferedOutputStream(new FileOutputStream(file)));
+ } else {
+ throw new RuntimeException("File doesn't exist: " + file.getName());
+ }
+ }
+
+ long getSize() {
+ return sourceFile.length() + targetFile.length() + featureFile.length();
+ }
+
+ @Override
+ public int compareTo(PackingFileTuple o) {
+ if (getSize() > o.getSize()) {
+ return -1;
+ } else if (getSize() < o.getSize()) {
+ return 1;
+ } else {
+ return 0;
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java
index 7752785,0000000..cbe6a7f
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMBerkeleySentenceProbablityTest.java
@@@ -1,64 -1,0 +1,64 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.berkeley_lm;
+
+import edu.berkeley.nlp.lm.ArrayEncodedNgramLanguageModel;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.assertFalse;
+
+import static org.testng.Assert.assertEquals;
+
+public class LMBerkeleySentenceProbablityTest {
+
+ @Test
+ public void verifySentenceLogProbability() {
- LMGrammarBerkeley grammar = new LMGrammarBerkeley(2, "resources/berkeley_lm/lm");
++ LMGrammarBerkeley grammar = new LMGrammarBerkeley(2, "src/test/resources/berkeley_lm/lm");
+ grammar.registerWord("the", 2);
+ grammar.registerWord("chat-rooms", 3);
+ grammar.registerWord("<unk>", 0);
+
+ ArrayEncodedNgramLanguageModel<String> lm = grammar.getLM();
+ float expected =
+ lm.getLogProb(new int[] {}, 0, 0)
+ + lm.getLogProb(new int[] {0}, 0, 1)
+ + lm.getLogProb(new int[] {0, 2}, 0, 2)
+ + lm.getLogProb(new int[] {2, 3}, 0, 2)
+ + lm.getLogProb(new int[] {3, 0}, 0, 2);
+
+ float result = grammar.sentenceLogProbability(new int[] {0, 2, 3, 0}, 2, 0);
+ assertEquals(expected, result, 0.0);
+ }
+
+ @Test
+ public void givenUnknownWord_whenIsOov_thenCorrectlyDetected() {
- LMGrammarBerkeley lm = new LMGrammarBerkeley(2, "resources/berkeley_lm/lm");
++ LMGrammarBerkeley lm = new LMGrammarBerkeley(2, "src/test/resources/berkeley_lm/lm");
+ assertTrue(lm.isOov(Vocabulary.id("UNKNOWN_WORD")));
+ assertFalse(lm.isOov(Vocabulary.id("chat-rooms")));
+ }
+
+ @AfterMethod
+ public void tearDown() {
+ Vocabulary.clear();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
index cc4a94c,0000000..cf04a3d
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/berkeley_lm/LMGrammarBerkeleyTest.java
@@@ -1,83 -1,0 +1,83 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.berkeley_lm;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.DataProvider;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertEquals;
+
+/**
+ * Replacement for test/lm/berkeley/test.sh regression test
+ */
+
+public class LMGrammarBerkeleyTest {
+
+ private static final String INPUT = "the chat-rooms";
+ private static final String EXPECTED_OUTPUT = "glue_0=-2.000000 lm_0=-7.152632\n";
+ private static final String EXPECTED_OUTPUT_WITH_OOV = "glue_0=-2.000000 lm_0=-7.152632 lm_0_oov=0.000000\n";
+ private static final String[] OPTIONS = "-v 1 -output-format %f".split(" ");
+
+ private JoshuaConfiguration joshuaConfig;
+ private Decoder decoder;
+
+ @DataProvider(name = "languageModelFiles")
+ public Object[][] lmFiles() {
- return new Object[][]{{"resources/berkeley_lm/lm"},
- {"resources/berkeley_lm/lm.gz"},
- {"resources/berkeley_lm/lm.berkeleylm"},
- {"resources/berkeley_lm/lm.berkeleylm.gz"}};
++ return new Object[][]{{"src/test/resources/berkeley_lm/lm"},
++ {"src/test/resources/berkeley_lm/lm.gz"},
++ {"src/test/resources/berkeley_lm/lm.berkeleylm"},
++ {"src/test/resources/berkeley_lm/lm.berkeleylm.gz"}};
+ }
+
+ @AfterMethod
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ }
+
+ @Test(dataProvider = "languageModelFiles")
+ public void verifyLM(String lmFile) {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.processCommandLineOptions(OPTIONS);
+ joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -lm_order 2 -lm_file " + lmFile);
+ decoder = new Decoder(joshuaConfig, null);
+ final String translation = decode(INPUT).toString();
+ assertEquals(translation, EXPECTED_OUTPUT);
+ }
+
+ private Translation decode(String input) {
+ final Sentence sentence = new Sentence(input, 0, joshuaConfig);
+ return decoder.decode(sentence);
+ }
+
+ @Test
+ public void givenLmWithOovFeature_whenDecoder_thenCorrectFeaturesReturned() {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.processCommandLineOptions(OPTIONS);
- joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -oov_feature -lm_order 2 -lm_file resources/berkeley_lm/lm");
++ joshuaConfig.features.add("LanguageModel -lm_type berkeleylm -oov_feature -lm_order 2 -lm_file src/test/resources/berkeley_lm/lm");
+ decoder = new Decoder(joshuaConfig, null);
+ final String translation = decode(INPUT).toString();
+ assertEquals(translation, EXPECTED_OUTPUT_WITH_OOV);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/class_lm/ClassBasedLanguageModelTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/class_lm/ClassBasedLanguageModelTest.java
index 0d7a9c4,0000000..2067f30
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/class_lm/ClassBasedLanguageModelTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/decoder/ff/lm/class_lm/ClassBasedLanguageModelTest.java
@@@ -1,77 -1,0 +1,77 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.class_lm;
+
+import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertTrue;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.ff.FeatureMap;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.lm.LanguageModelFF;
+import org.apache.joshua.decoder.ff.tm.OwnerMap;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.util.io.KenLmTestUtil;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+/**
+ * This unit test relies on KenLM. If the KenLM library is not found when the test is run all tests will be skipped.
+ */
+public class ClassBasedLanguageModelTest {
+
+ private static final float WEIGHT = 0.5f;
+
+ private LanguageModelFF ff;
+
+ @BeforeMethod
+ public void setUp() {
+ Decoder.resetGlobalState();
+
+ FeatureVector weights = new FeatureVector(1);
+ weights.put(FeatureMap.hashFeature("lm_0"), WEIGHT);
+ String[] args = { "-lm_type", "kenlm", "-lm_order", "9",
- "-lm_file", "./src/test/resources/lm/class_lm/class_lm_9gram.gz",
- "-class_map", "./src/test/resources/lm/class_lm/class.map" };
++ "-lm_file", "src/test/resources/lm/class_lm/class_lm_9gram.gz",
++ "-class_map", "src/test/resources/lm/class_lm/class.map" };
+
+ JoshuaConfiguration config = new JoshuaConfiguration();
+ KenLmTestUtil.Guard(() -> ff = new LanguageModelFF(weights, args, config));
+ }
+
+ @AfterMethod
+ public void tearDown() {
+ Decoder.resetGlobalState();
+ }
+
+ @Test
+ public void givenLmDefinition_whenInitialized_thenInitializationIsCorrect() {
+ assertTrue(ff.isClassLM());
+ assertTrue(ff.isStateful());
+ }
+
+ @Test
+ public void givenRuleWithSingleWord_whenGetRuleId_thenIsMappedToClass() {
+ final int[] target = Vocabulary.addAll(new String[] { "professionalism" });
+ final Rule rule = new Rule(0, null, target, 0, new FeatureVector(0), null, OwnerMap.register(OwnerMap.UNKNOWN_OWNER));
+ assertEquals(Vocabulary.word(ff.getRuleIds(rule)[0]), "13");
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
index 41569aa,0000000..f2cbe7f
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/decoder/kbest_extraction/KBestExtractionTest.java
@@@ -1,83 -1,0 +1,83 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.kbest_extraction;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.io.KenLmTestUtil;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+import org.testng.reporters.Files;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static java.nio.file.Files.readAllBytes;
+import static org.testng.Assert.assertEquals;
+
+/**
+ * Reimplements the kbest extraction regression test
+ * TODO (fhieber): this test strangely only works with StateMinimizing KenLM.
+ * This is to be investigated
+ */
+
+public class KBestExtractionTest {
+
- private static final String CONFIG = "resources/kbest_extraction/joshua.config";
++ private static final String CONFIG = "src/test/resources/kbest_extraction/joshua.config";
+ private static final String INPUT = "a b c d e";
- private static final Path GOLD_PATH = Paths.get("resources/kbest_extraction/output.scores.gold");
++ private static final Path GOLD_PATH = Paths.get("src/test/resources/kbest_extraction/output.scores.gold");
+
+ private JoshuaConfiguration joshuaConfig = null;
+ private Decoder decoder = null;
+
+ @BeforeMethod
+ public void setUp() throws Exception {
+ //BROKEN
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.readConfigFile(CONFIG);
+ joshuaConfig.outputFormat = "%i ||| %s ||| %c";
+ KenLmTestUtil.Guard(() -> decoder = new Decoder(joshuaConfig, ""));
+ }
+
+ @AfterMethod
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ decoder = null;
+ }
+
+ @Test
+ public void givenInput_whenKbestExtraction_thenOutputIsAsExpected() throws IOException {
+ final String translation = decode(INPUT).toString();
+ final String gold = new String(readAllBytes(GOLD_PATH), UTF_8);
+ Files.writeFile(translation, new File("resources/kbest_extraction/output.actual"));
+ assertEquals(translation, gold);
+ }
+
+ private Translation decode(String input) {
+ final Sentence sentence = new Sentence(input, 0, joshuaConfig);
+ return decoder.decode(sentence);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java
index f2fc6a7,0000000..625fe0c
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/decoder/phrase/decode/PhraseDecodingTest.java
@@@ -1,75 -1,0 +1,90 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.decoder.phrase.decode;
+
- import static com.google.common.base.Charsets.UTF_8;
- import static java.nio.file.Files.readAllBytes;
+import static org.testng.Assert.assertEquals;
+
+import java.io.IOException;
- import java.nio.file.Path;
- import java.nio.file.Paths;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.io.KenLmTestUtil;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+/**
+ * Reimplements the constrained phrase decoding test
+ */
+public class PhraseDecodingTest {
+
- private static final String CONFIG = "resources/phrase_decoder/config";
++ private static final String CONFIG = "src/test/resources/phrase_decoder/config";
+ private static final String INPUT = "una estrategia republicana para obstaculizar la reelecci�n de Obama";
- private static final Path GOLD_PATH = Paths.get("resources/phrase_decoder/output.gold");
-
++ private static final String OUTPUT = "0 ||| a strategy republican to hinder reelection Obama ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496";
++ private static final String OUTPUT_WITH_ALIGNMENTS = "0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496";
++
+ private JoshuaConfiguration joshuaConfig = null;
+ private Decoder decoder = null;
+
+ @BeforeMethod
+ public void setUp() throws Exception {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.readConfigFile(CONFIG);
+ KenLmTestUtil.Guard(() -> decoder = new Decoder(joshuaConfig, ""));
+ }
+
+ @AfterMethod
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ decoder = null;
+ }
+
- @Test(enabled = false)
++ @Test(enabled = true)
+ public void givenInput_whenPhraseDecoding_thenOutputIsAsExpected() throws IOException {
- final String translation = decode(INPUT).toString();
- final String gold = new String(readAllBytes(GOLD_PATH), UTF_8);
- assertEquals(gold, translation);
++ final String translation = decode(INPUT).toString().trim();
++ final String gold = OUTPUT;
++ assertEquals(translation, gold);
++ }
++
++ @Test(enabled = false)
++ public void givenInput_whenPhraseDecodingWithAlignments_thenOutputHasAlignments() throws IOException {
++ final String translation = decode(INPUT).toString().trim();
++ final String gold = OUTPUT_WITH_ALIGNMENTS;
++ assertEquals(translation, gold);
++ }
++
++ @Test(enabled = true)
++ public void givenInput_whenPhraseDecoding_thenInputCanBeRetrieved() throws IOException {
++ String outputFormat = joshuaConfig.outputFormat;
++ joshuaConfig.outputFormat = "%e";
++ final String translation = decode(INPUT).toString().trim();
++ joshuaConfig.outputFormat = outputFormat;
++ final String gold = INPUT;
++ assertEquals(translation, gold);
+ }
+
+ private Translation decode(String input) {
+ final Sentence sentence = new Sentence(input, 0, joshuaConfig);
++// joshuaConfig.setVerbosity(2);
+ return decoder.decode(sentence);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/system/KenLmTest.java
index 40514cd,0000000..74baef3
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/system/KenLmTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/system/KenLmTest.java
@@@ -1,100 -1,0 +1,100 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.system;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.lm.KenLM;
+import org.apache.joshua.util.io.KenLmTestUtil;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import static org.apache.joshua.corpus.Vocabulary.registerLanguageModel;
+import static org.apache.joshua.corpus.Vocabulary.unregisterLanguageModels;
+import static org.testng.Assert.assertTrue;
+import static org.testng.AssertJUnit.assertEquals;
+import static org.testng.AssertJUnit.assertFalse;
+
+/**
+ * KenLM JNI interface tests.
+ * Loads libken.{so,dylib}.
+ * If run in Eclipse, add -Djava.library.path=./lib to JVM arguments
+ * of the run configuration.
+ */
+
+public class KenLmTest {
+
- private static final String LANGUAGE_MODEL_PATH = "resources/kenlm/oilers.kenlm";
++ private static final String LANGUAGE_MODEL_PATH = "src/test/resources/kenlm/oilers.kenlm";
+ private KenLM kenLm;
+
+ @Test
+ public void givenKenLm_whenQueryingForNgramProbability_thenProbIsCorrect() {
+ // GIVEN
+ KenLmTestUtil.Guard(() -> kenLm = new KenLM(3, LANGUAGE_MODEL_PATH));
+
+ int[] words = Vocabulary.addAll("Wayne Gretzky");
+ registerLanguageModel(kenLm);
+
+ // WHEN
+ float probability = kenLm.prob(words);
+
+ // THEN
+ assertEquals("Found the wrong probability for 2-gram \"Wayne Gretzky\"", -0.99f, probability,
+ Float.MIN_VALUE);
+ }
+
+ @Test
+ public void givenKenLm_whenQueryingForNgramProbability_thenIdAndStringMethodsReturnTheSame() {
+ // GIVEN
+ KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
+
+ registerLanguageModel(kenLm);
+ String sentence = "Wayne Gretzky";
+ String[] words = sentence.split("\\s+");
+ int[] ids = Vocabulary.addAll(sentence);
+
+ // WHEN
+ float prob_string = kenLm.prob(words);
+ float prob_id = kenLm.prob(ids);
+
+ // THEN
+ assertEquals("ngram probabilities differ for word and id based n-gram query", prob_string, prob_id,
+ Float.MIN_VALUE);
+
+ }
+
+ @Test
+ public void givenKenLm_whenIsKnownWord_thenReturnValuesAreCorrect() {
+ KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
+ assertTrue(kenLm.isKnownWord("Wayne"));
+ assertFalse(kenLm.isKnownWord("Wayne2222"));
+ }
+
+ @BeforeMethod
+ public void setUp() throws Exception {
+ Vocabulary.clear();
+ unregisterLanguageModels();
+ }
+
+ @AfterMethod
+ public void tearDown() throws Exception {
+ Vocabulary.clear();
+ unregisterLanguageModels();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/system/LmOovFeatureTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/system/LmOovFeatureTest.java
index 9e2f622,0000000..84789ce
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/system/LmOovFeatureTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/system/LmOovFeatureTest.java
@@@ -1,76 -1,0 +1,76 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+ package org.apache.joshua.system;
+
+import static org.testng.Assert.assertEquals;
+
+import java.io.IOException;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+public class LmOovFeatureTest {
+
- private static final String CONFIG = "resources/lm_oov/joshua.config";
++ private static final String CONFIG = "src/test/resources/lm_oov/joshua.config";
+ private static final String INPUT = "a chat-rooms full";
+ // expecting 2 lm oovs ('a' & 'full') and 2 grammar OOVs ('chat-rooms' & 'full') and score -198.000
+ private static final String EXPECTED_FEATURES = "lm_0=-206.718124 lm_0_oov=2.000000 OOVPenalty=-200.000000 pt_0=2.000000 glue_0=-3.000000 | -198.000";
+
+ private JoshuaConfiguration joshuaConfig = null;
+ private Decoder decoder = null;
+
+ @BeforeMethod
+ public void setUp() throws Exception {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.readConfigFile(CONFIG);
+ joshuaConfig.outputFormat = "%f | %c";
+ decoder = new Decoder(joshuaConfig, "");
+ }
+
+ @AfterMethod
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ decoder = null;
+ }
+
+ @Test
+ public void givenInputWithDifferentOovTypes_whenDecode_thenFeaturesAreAsExpected() throws IOException {
+ final String translation = decode(INPUT).toString().trim();
+ System.out.println(translation);
+ assertEquals(translation, EXPECTED_FEATURES);
+ }
+
+ private Translation decode(String input) {
+ final Sentence sentence = new Sentence(input, 0, joshuaConfig);
+ return decoder.decode(sentence);
+ }
+
+ public static void main(String[] args) throws Exception {
+
+ LmOovFeatureTest test = new LmOovFeatureTest();
+ test.setUp();
+ test.givenInputWithDifferentOovTypes_whenDecode_thenFeaturesAreAsExpected();
+ test.tearDown();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
index 092dbc6,0000000..7b1c47f
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
+++ b/joshua-core/src/test/java/org/apache/joshua/system/MultithreadedTranslationTests.java
@@@ -1,155 -1,0 +1,155 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.system;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.Translations;
+import org.apache.joshua.decoder.io.TranslationRequestStream;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.assertTrue;
+
+/**
+ * Integration test for multithreaded Joshua decoder tests. Grammar used is a
+ * toy packed grammar.
+ *
+ * @author kellens
+ */
+public class MultithreadedTranslationTests {
+
+ private JoshuaConfiguration joshuaConfig = null;
+ private Decoder decoder = null;
+ private static final String INPUT = "A K B1 U Z1 Z2 B2 C";
+ private int previousLogLevel;
+ private final static long NANO_SECONDS_PER_SECOND = 1_000_000_000;
+
+ @BeforeMethod
+ public void setUp() throws Exception {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.search_algorithm = "cky";
+ joshuaConfig.mark_oovs = false;
+ joshuaConfig.pop_limit = 100;
+ joshuaConfig.use_unique_nbest = false;
+ joshuaConfig.include_align_index = false;
+ joshuaConfig.topN = 0;
- joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar.packed");
- joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
++ joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path src/test/resources/wa_grammar.packed");
++ joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path src/test/resources/grammar.glue");
+ joshuaConfig.goal_symbol = "[GOAL]";
+ joshuaConfig.default_non_terminal = "[X]";
+ joshuaConfig.features.add("OOVPenalty");
+ joshuaConfig.weights.add("tm_pt_0 1");
+ joshuaConfig.weights.add("tm_pt_1 1");
+ joshuaConfig.weights.add("tm_pt_2 1");
+ joshuaConfig.weights.add("tm_pt_3 1");
+ joshuaConfig.weights.add("tm_pt_4 1");
+ joshuaConfig.weights.add("tm_pt_5 1");
+ joshuaConfig.weights.add("tm_glue_0 1");
+ joshuaConfig.weights.add("OOVPenalty 2");
+ joshuaConfig.num_parallel_decoders = 500; // This will enable 500 parallel
+ // decoders to run at once.
+ // Useful to help flush out
+ // concurrency errors in
+ // underlying
+ // data-structures.
+ this.decoder = new Decoder(joshuaConfig, ""); // Second argument
+ // (configFile)
+ // is not even used by the
+ // constructor/initialize.
+
+ previousLogLevel = Decoder.VERBOSE;
+ Decoder.VERBOSE = 0;
+ }
+
+ @AfterMethod
+ public void tearDown() throws Exception {
+ this.decoder.cleanUp();
+ this.decoder = null;
+ Decoder.VERBOSE = previousLogLevel;
+ }
+
+
+
+ // This test was created specifically to reproduce a multithreaded issue
+ // related to mapped byte array access in the PackedGrammer getAlignmentArray
+ // function.
+
+ // We'll test the decoding engine using N = 10,000 identical inputs. This
+ // should be sufficient to induce concurrent data access for many shared
+ // data structures.
+
+ @Test
+ public void givenPackedGrammar_whenNTranslationsCalledConcurrently_thenReturnNResults() throws IOException {
+ // GIVEN
+
+ int inputLines = 10000;
+ joshuaConfig.use_structured_output = true; // Enabled alignments.
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < inputLines; i++) {
+ sb.append(INPUT + "\n");
+ }
+
+ // Append a large string together to simulate N requests to the decoding
+ // engine.
+ TranslationRequestStream req = new TranslationRequestStream(
+ new BufferedReader(new InputStreamReader(new ByteArrayInputStream(sb.toString()
+ .getBytes(Charset.forName("UTF-8"))))), joshuaConfig);
+
+ ByteArrayOutputStream output = new ByteArrayOutputStream();
+
+ // WHEN
+ // Translate all spans in parallel.
+ Translations translations = this.decoder.decodeAll(req);
+
+ ArrayList<Translation> translationResults = new ArrayList<Translation>();
+
+
+ final long translationStartTime = System.nanoTime();
+ try {
+ for (Translation t: translations)
+ translationResults.add(t);
+ } finally {
+ if (output != null) {
+ try {
+ output.close();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ }
+
+ final long translationEndTime = System.nanoTime();
+ final double pipelineLoadDurationInSeconds = (translationEndTime - translationStartTime) / ((double)NANO_SECONDS_PER_SECOND);
+ System.err.println(String.format("%.2f seconds", pipelineLoadDurationInSeconds));
+
+ // THEN
+ assertTrue(translationResults.size() == inputLines);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/dc756709/joshua-core/src/test/java/org/apache/joshua/system/StructuredOutputTest.java
----------------------------------------------------------------------
diff --cc joshua-core/src/test/java/org/apache/joshua/system/StructuredOutputTest.java
index 0cc8721,0000000..1c9a6fe
mode 100644,000000..100644
--- a/joshua-core/src/test/java/org/apache/joshua/system/StructuredOutputTest.java
+++ b/joshua-core/src/test/java/org/apache/joshua/system/StructuredOutputTest.java
@@@ -1,114 -1,0 +1,114 @@@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.system;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.Translation;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.testng.Assert;
+import org.testng.annotations.AfterMethod;
+import org.testng.annotations.BeforeMethod;
+import org.testng.annotations.Test;
+
+/**
+ * Integration test for the complete Joshua decoder using a toy grammar that translates
+ * a bunch of capital letters to lowercase letters. Rules in the test grammar
+ * drop and generate additional words and simulate reordering of rules, so that
+ * proper extraction of word alignments can be tested.
+ *
+ * @author fhieber
+ */
+public class StructuredOutputTest {
+
+ private JoshuaConfiguration joshuaConfig = null;
+ private Decoder decoder = null;
+ private Translation translation = null;
+ private static final String input = "A K B1 U Z1 Z2 B2 C";
+ private static final String expectedTranslation = "a b n1 u z c1 k1 k2 k3 n1 n2 n3 c2";
+ private static final String expectedWordAlignmentString = "0-0 2-1 6-1 3-3 4-4 5-4 7-5 1-6 1-7 1-8 7-12";
+ private static final List<List<Integer>> expectedWordAlignment = Arrays.asList(
+ Arrays.asList(0), Arrays.asList(2, 6), Arrays.asList(), Arrays.asList(3),
+ Arrays.asList(4, 5), Arrays.asList(7), Arrays.asList(1),
+ Arrays.asList(1), Arrays.asList(1), Arrays.asList(), Arrays.asList(),
+ Arrays.asList(), Arrays.asList(7));
+ private static final double expectedScore = -17.0;
+
+ @BeforeMethod
+ public void setUp() throws Exception {
+ joshuaConfig = new JoshuaConfiguration();
+ joshuaConfig.search_algorithm = "cky";
+ joshuaConfig.mark_oovs = false;
+ joshuaConfig.pop_limit = 100;
+ joshuaConfig.use_unique_nbest = false;
+ joshuaConfig.include_align_index = false;
+ joshuaConfig.topN = 0;
- joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar");
- joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
++ joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path src/test/resources/wa_grammar");
++ joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path src/test/resources/grammar.glue");
+ joshuaConfig.goal_symbol = "[GOAL]";
+ joshuaConfig.default_non_terminal = "[X]";
+ joshuaConfig.features.add("OOVPenalty");
+ joshuaConfig.weights.add("pt_0 -1");
+ joshuaConfig.weights.add("pt_1 -1");
+ joshuaConfig.weights.add("pt_2 -1");
+ joshuaConfig.weights.add("pt_3 -1");
+ joshuaConfig.weights.add("pt_4 -1");
+ joshuaConfig.weights.add("pt_5 -1");
+ joshuaConfig.weights.add("glue_0 -1");
+ joshuaConfig.weights.add("OOVPenalty 2");
+ decoder = new Decoder(joshuaConfig, ""); // second argument (configFile
+ // is not even used by the
+ // constructor/initialize)
+ }
+
+ @AfterMethod
+ public void tearDown() throws Exception {
+ decoder.cleanUp();
+ decoder = null;
+ translation = null;
+ }
+
+ private Translation decode(String input) {
+ Sentence sentence = new Sentence(input, 0, joshuaConfig);
+ return decoder.decode(sentence);
+ }
+
+ @Test
+ public void test() {
+
+ // test standard output
+ joshuaConfig.use_structured_output = false;
+ joshuaConfig.outputFormat = "%s | %a ";
+ translation = decode(input);
+ Assert.assertEquals(translation.toString().trim(), expectedTranslation + " | " + expectedWordAlignmentString);
+
+ // test structured output
+ joshuaConfig.use_structured_output = true; // set structured output creation to true
+ translation = decode(input);
+ Assert.assertEquals(translation.getStructuredTranslations().get(0).getTranslationString(), expectedTranslation);
+ Assert.assertEquals(translation.getStructuredTranslations().get(0).getTranslationTokens(), Arrays.asList(expectedTranslation.split("\\s+")));
+ Assert.assertEquals(translation.getStructuredTranslations().get(0).getTranslationScore(), expectedScore, 0.00001);
+ Assert.assertEquals(translation.getStructuredTranslations().get(0).getTranslationWordAlignments(), expectedWordAlignment);
+ Assert.assertEquals(translation.getStructuredTranslations().get(0).getTranslationWordAlignments().size(), translation
+ .getStructuredTranslations().get(0).getTranslationTokens().size());
+ }
+}