You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/31 15:14:35 UTC
[4/9] incubator-joshua git commit: StructuredTranslation objects can
now be generated from KBest Derivations. This provides a way to expose k-best
lists if Joshua is used as a library. Also fixed some code issues and tests.
StructuredTranslation objects can now be generated from KBest Derivations. This provides a way to expose k-best lists if Joshua is used as a library.
Also fixed some code issues and tests.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/e3673e98
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/e3673e98
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/e3673e98
Branch: refs/heads/master
Commit: e3673e988d5d27f93e69cf270dd4056547a752b9
Parents: 4d73c17
Author: Felix Hieber <fh...@amazon.com>
Authored: Tue Mar 15 11:26:29 2016 +0100
Committer: Felix Hieber <fh...@amazon.com>
Committed: Mon May 30 09:09:10 2016 +0200
----------------------------------------------------------------------
src/joshua/decoder/StructuredTranslation.java | 67 ++++------
.../decoder/StructuredTranslationFactory.java | 101 +++++++++++++++
src/joshua/decoder/Translation.java | 123 ++++++++++++-------
.../decoder/hypergraph/KBestExtractor.java | 85 +++++++++----
.../decoder/hypergraph/WordAlignmentState.java | 103 +++++++++-------
src/joshua/decoder/io/JSONMessage.java | 2 +-
tst/joshua/corpus/VocabularyTest.java | 26 ++--
.../kbest_extraction/KBestExtractionTest.java | 10 +-
.../ConstrainedPhraseDecodingTest.java | 10 +-
.../phrase/decode/PhraseDecodingTest.java | 10 +-
tst/joshua/system/KenLmTest.java | 6 +-
.../system/MultithreadedTranslationTests.java | 10 +-
tst/joshua/system/StructuredOutputTest.java | 16 +--
.../system/StructuredTranslationTest.java | 107 ++++++++++++----
tst/joshua/util/FormatUtilsTest.java | 8 +-
15 files changed, 446 insertions(+), 238 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/StructuredTranslation.java b/src/joshua/decoder/StructuredTranslation.java
index 7b2185f..2a7af73 100644
--- a/src/joshua/decoder/StructuredTranslation.java
+++ b/src/joshua/decoder/StructuredTranslation.java
@@ -18,27 +18,18 @@
*/
package joshua.decoder;
-import static java.util.Arrays.asList;
-import static java.util.Collections.emptyList;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
-import static joshua.util.FormatUtils.removeSentenceMarkers;
-
import java.util.List;
import java.util.Map;
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.hypergraph.HyperGraph;
import joshua.decoder.segment_file.Sentence;
/**
- * structuredTranslation provides a more structured access to translation
- * results than the Translation class.
- * Members of instances of this class can be used upstream.
- * <br/>
- * TODO:
- * Enable K-Best extraction.
+ * A StructuredTranslation instance provides a more structured access to
+ * translation results than the string-based Translation class.
+ * This is useful if the decoder is encapsulated in a larger project, instead
+ * of simply writing to a file or stdout.
+ * StructuredTranslation encodes all relevant information about a derivation,
+ * namely output string, tokens, score, features, and word alignment.
*
* @author fhieber
*/
@@ -52,39 +43,23 @@ public class StructuredTranslation {
private final Map<String,Float> translationFeatures;
private final float extractionTime;
- public StructuredTranslation(final Sentence sourceSentence,
- final HyperGraph hypergraph,
- final List<FeatureFunction> featureFunctions) {
-
- final long startTime = System.currentTimeMillis();
-
- this.sourceSentence = sourceSentence;
- this.translationString = removeSentenceMarkers(getViterbiString(hypergraph));
- this.translationTokens = extractTranslationTokens();
- this.translationScore = extractTranslationScore(hypergraph);
- this.translationFeatures = getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap();
- this.translationWordAlignments = getViterbiWordAlignmentList(hypergraph);
- this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
- }
-
- private float extractTranslationScore(final HyperGraph hypergraph) {
- if (hypergraph == null) {
- return 0;
- } else {
- return hypergraph.goalNode.getScore();
- }
+ public StructuredTranslation(
+ final Sentence sourceSentence,
+ final String translationString,
+ final List<String> translationTokens,
+ final float translationScore,
+ final List<List<Integer>> translationWordAlignments,
+ final Map<String,Float> translationFeatures,
+ final float extractionTime) {
+ this.sourceSentence = sourceSentence;
+ this.translationString = translationString;
+ this.translationTokens = translationTokens;
+ this.translationScore = translationScore;
+ this.translationWordAlignments = translationWordAlignments;
+ this.translationFeatures = translationFeatures;
+ this.extractionTime = extractionTime;
}
- private List<String> extractTranslationTokens() {
- if (translationString.isEmpty()) {
- return emptyList();
- } else {
- return asList(translationString.split("\\s+"));
- }
- }
-
- // Getters to use upstream
-
public Sentence getSourceSentence() {
return sourceSentence;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/src/joshua/decoder/StructuredTranslationFactory.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/StructuredTranslationFactory.java b/src/joshua/decoder/StructuredTranslationFactory.java
new file mode 100644
index 0000000..c6bfb50
--- /dev/null
+++ b/src/joshua/decoder/StructuredTranslationFactory.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package joshua.decoder;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.emptyList;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
+import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
+import static joshua.util.FormatUtils.removeSentenceMarkers;
+
+import java.util.List;
+
+import joshua.decoder.ff.FeatureFunction;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
+import joshua.decoder.segment_file.Sentence;
+
+/**
+ * This factory provides methods to create StructuredTranslation objects
+ * from either Viterbi derivations or KBest derivations.
+ *
+ * @author fhieber
+ */
+public class StructuredTranslationFactory {
+
+ /**
+ * Returns a StructuredTranslation instance from the Viterbi derivation.
+ * @return A StructuredTranslation object representing the Viterbi derivation.
+ */
+ public static StructuredTranslation fromViterbiDerivation(
+ final Sentence sourceSentence,
+ final HyperGraph hypergraph,
+ final List<FeatureFunction> featureFunctions) {
+ final long startTime = System.currentTimeMillis();
+ final String translationString = removeSentenceMarkers(getViterbiString(hypergraph));
+ return new StructuredTranslation(
+ sourceSentence,
+ translationString,
+ extractTranslationTokens(translationString),
+ extractTranslationScore(hypergraph),
+ getViterbiWordAlignmentList(hypergraph),
+ getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap(),
+ (System.currentTimeMillis() - startTime) / 1000.0f);
+ }
+
+ /**
+ * Returns a StructuredTranslation instance from a KBest DerivationState.
+ * @param sourceSentence Sentence object representing the source.
+ * @param derivationState the KBest DerivationState.
+ * @return A StructuredTranslation object representing the derivation encoded by derivationState.
+ */
+ public static StructuredTranslation fromKBestDerivation(
+ final Sentence sourceSentence,
+ final DerivationState derivationState) {
+ final long startTime = System.currentTimeMillis();
+ final String translationString = removeSentenceMarkers(derivationState.getHypothesis());
+ return new StructuredTranslation(
+ sourceSentence,
+ translationString,
+ extractTranslationTokens(translationString),
+ derivationState.getModelCost(),
+ derivationState.getWordAlignmentList(),
+ derivationState.getFeatures().getMap(),
+ (System.currentTimeMillis() - startTime) / 1000.0f);
+ }
+
+ private static float extractTranslationScore(final HyperGraph hypergraph) {
+ if (hypergraph == null) {
+ return 0;
+ } else {
+ return hypergraph.goalNode.getScore();
+ }
+ }
+
+ private static List<String> extractTranslationTokens(final String translationString) {
+ if (translationString.isEmpty()) {
+ return emptyList();
+ } else {
+ return asList(translationString.split("\\s+"));
+ }
+ }
+
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/src/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Translation.java b/src/joshua/decoder/Translation.java
index 8004d9f..03ea62f 100644
--- a/src/joshua/decoder/Translation.java
+++ b/src/joshua/decoder/Translation.java
@@ -18,6 +18,8 @@
*/
package joshua.decoder;
+import static java.util.Arrays.asList;
+import static joshua.decoder.StructuredTranslationFactory.fromViterbiDerivation;
import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignments;
@@ -42,6 +44,7 @@ import joshua.decoder.segment_file.Sentence;
* DecoderThread instances to the InputHandler, where they are assembled in order for output.
*
* @author Matt Post <po...@cs.jhu.edu>
+ * @author Felix Hieber <fh...@amazon.com>
*/
public class Translation {
@@ -53,17 +56,44 @@ public class Translation {
*/
private String output = null;
- private StructuredTranslation structuredTranslation = null;
+ /**
+ * Stores the list of StructuredTranslations.
+ * If joshuaConfig.topN == 0, will only contain the Viterbi translation.
+ * Else it will use KBestExtractor to populate this list.
+ */
+ private List<StructuredTranslation> structuredTranslations = null;
public Translation(Sentence source, HyperGraph hypergraph,
List<FeatureFunction> featureFunctions, JoshuaConfiguration joshuaConfiguration) {
this.source = source;
+ /**
+ * Structured output from Joshua provides a way to programmatically access translation results
+ * from downstream applications, instead of writing results as strings to an output buffer.
+ */
if (joshuaConfiguration.use_structured_output) {
- structuredTranslation = new StructuredTranslation(
- source, hypergraph, featureFunctions);
- this.output = structuredTranslation.getTranslationString();
+ if (joshuaConfiguration.topN == 0) {
+ /*
+ * Obtain Viterbi StructuredTranslation
+ */
+ StructuredTranslation translation = fromViterbiDerivation(source, hypergraph, featureFunctions);
+ this.output = translation.getTranslationString();
+ structuredTranslations = asList(translation);
+
+ } else {
+ /*
+ * Get K-Best list of StructuredTranslations
+ */
+ final KBestExtractor kBestExtractor = new KBestExtractor(source, featureFunctions, Decoder.weights, false, joshuaConfiguration);
+ structuredTranslations = kBestExtractor.KbestExtractOnHG(hypergraph, joshuaConfiguration.topN);
+ if (structuredTranslations.isEmpty()) {
+ this.output = "";
+ } else {
+ this.output = structuredTranslations.get(0).getTranslationString();
+ }
+ // TODO: We omit the BLEU rescoring for now since it is not clear whether it works at all and what the desired output is below.
+ }
} else {
@@ -71,7 +101,9 @@ public class Translation {
BufferedWriter out = new BufferedWriter(sw);
try {
+
if (hypergraph != null) {
+
if (!joshuaConfiguration.hypergraphFilePattern.equals("")) {
hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, source.id()), featureFunctions);
}
@@ -132,44 +164,26 @@ public class Translation {
Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f seconds", id(),
joshuaConfiguration.topN, seconds));
- } else {
-
- // Failed translations and blank lines get empty formatted outputs
- // @formatter:off
- String outputString = joshuaConfiguration.outputFormat
- .replace("%s", source.source())
- .replace("%e", "")
- .replace("%S", "")
- .replace("%t", "()")
- .replace("%i", Integer.toString(source.id()))
- .replace("%f", "")
- .replace("%c", "0.000");
- // @formatter:on
-
- out.write(outputString);
- out.newLine();
- }
+ } else {
+
+ // Failed translations and blank lines get empty formatted outputs
+ out.write(getFailedTranslationOutput(source, joshuaConfiguration));
+ out.newLine();
+
+ }
out.flush();
+
} catch (IOException e) {
- e.printStackTrace();
- System.exit(1);
+ throw new RuntimeException(e);
}
this.output = sw.toString();
}
-
- /*
- * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
- * objects for this sentence.
- */
- for (FeatureFunction feature : featureFunctions) {
- if (feature instanceof StateMinimizingLanguageModel) {
- ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
- break;
- }
- }
+
+ // remove state from StateMinimizingLanguageModel instances in features.
+ destroyKenLMStates(featureFunctions);
}
@@ -186,17 +200,42 @@ public class Translation {
return output;
}
+ private String getFailedTranslationOutput(final Sentence source, final JoshuaConfiguration joshuaConfiguration) {
+ return joshuaConfiguration.outputFormat
+ .replace("%s", source.source())
+ .replace("%e", "")
+ .replace("%S", "")
+ .replace("%t", "()")
+ .replace("%i", Integer.toString(source.id()))
+ .replace("%f", "")
+ .replace("%c", "0.000");
+ }
+
/**
- * Returns the StructuredTranslation object
- * if JoshuaConfiguration.construct_structured_output == True.
- * @throws RuntimeException if StructuredTranslation object not set.
- * @return
+ * Returns the StructuredTranslations
+ * if JoshuaConfiguration.use_structured_output == True.
+ * @throws RuntimeException if JoshuaConfiguration.use_structured_output == False.
+ * @return List of StructuredTranslations.
*/
- public StructuredTranslation getStructuredTranslation() {
- if (structuredTranslation == null) {
- throw new RuntimeException("No StructuredTranslation object created. You should set JoshuaConfigration.construct_structured_output = true");
+ public List<StructuredTranslation> getStructuredTranslations() {
+ if (structuredTranslations == null) {
+ throw new RuntimeException(
+ "No StructuredTranslation objects created. You should set JoshuaConfigration.use_structured_output = true");
+ }
+ return structuredTranslations;
+ }
+
+ /**
+ * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
+ * objects for this sentence.
+ */
+ private void destroyKenLMStates(final List<FeatureFunction> featureFunctions) {
+ for (FeatureFunction feature : featureFunctions) {
+ if (feature instanceof StateMinimizingLanguageModel) {
+ ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
+ break;
+ }
}
- return structuredTranslation;
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 6dd3207..d6e7c60 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -1,4 +1,4 @@
-/*
+ /*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -18,15 +18,16 @@
*/
package joshua.decoder.hypergraph;
-import static joshua.util.FormatUtils.unescapeSpecialSymbols;
+import static java.util.Collections.emptyList;
import static joshua.util.FormatUtils.removeSentenceMarkers;
+import static joshua.util.FormatUtils.unescapeSpecialSymbols;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
+import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
-import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -35,6 +36,8 @@ import java.util.PriorityQueue;
import joshua.corpus.Vocabulary;
import joshua.decoder.BLEU;
import joshua.decoder.JoshuaConfiguration;
+import joshua.decoder.StructuredTranslation;
+import joshua.decoder.StructuredTranslationFactory;
import joshua.decoder.ff.FeatureFunction;
import joshua.decoder.ff.FeatureVector;
import joshua.decoder.ff.fragmentlm.Tree;
@@ -153,11 +156,49 @@ public class KBestExtractor {
* @return the derivation object
*/
public DerivationState getKthDerivation(HGNode node, int k) {
- VirtualNode virtualNode = getVirtualNode(node);
+ final VirtualNode virtualNode = getVirtualNode(node);
return virtualNode.lazyKBestExtractOnNode(this, k);
}
/**
+ * Returns the k-th Structured Translation.
+ */
+ public StructuredTranslation getKthStructuredTranslation(HGNode node, int k) {
+ StructuredTranslation result = null;
+ final DerivationState derivationState = getKthDerivation(node, k);
+ if (derivationState != null) {
+ result = StructuredTranslationFactory.fromKBestDerivation(sentence, derivationState);
+ }
+ return result;
+ }
+
+ /**
+ * This is an entry point for extracting k-best hypotheses as StructuredTranslation objects.
+ * It computes all of them and returns a list of StructuredTranslation objects.
+ * These objects hold all translation information (string, tokens, features, alignments, score).
+ *
+ * @param hg the hypergraph to extract from
+ * @param topN how many to extract
+ * @param out object to write to
+ * @return list of StructuredTranslation objects, empty if there is no HyperGraph goal node.
+ */
+ public List<StructuredTranslation> KbestExtractOnHG(HyperGraph hg, int topN) {
+ resetState();
+ if (hg.goalNode == null) {
+ return emptyList();
+ }
+ final List<StructuredTranslation> kbest = new ArrayList<>(topN);
+ for (int k = 1; k <= topN; k++) {
+ StructuredTranslation translation = getKthStructuredTranslation(hg.goalNode, k);
+ if (translation == null) {
+ break;
+ }
+ kbest.add(translation);
+ }
+ return kbest;
+ }
+
+ /**
* Compute the string that is output from the decoder, using the "output-format" config file
* parameter as a template.
*
@@ -166,11 +207,7 @@ public class KBestExtractor {
public String getKthHyp(HGNode node, int k) {
String outputString = null;
-
- // Determine the k-best hypotheses at each HGNode
- VirtualNode virtualNode = getVirtualNode(node);
- DerivationState derivationState = virtualNode.lazyKBestExtractOnNode(this, k);
-// DerivationState derivationState = getKthDerivation(node, k);
+ DerivationState derivationState = getKthDerivation(node, k);
if (derivationState != null) {
// ==== read the kbest from each hgnode and convert to output format
String hypothesis = maybeProjectCase(
@@ -213,7 +250,7 @@ public class KBestExtractor {
/* %a causes output of word level alignments between input and output hypothesis */
if (outputFormat.contains("%a")) {
- outputString = outputString.replace("%a", derivationState.getWordAlignmentString());
+ outputString = outputString.replace("%a", derivationState.getWordAlignment());
}
}
@@ -236,7 +273,7 @@ public class KBestExtractor {
if (joshuaConfiguration.project_case) {
String[] tokens = hypothesis.split("\\s+");
- List<List<Integer>> points = state.getWordAlignment();
+ List<List<Integer>> points = state.getWordAlignmentList();
for (int i = 0; i < points.size(); i++) {
List<Integer> target = points.get(i);
for (int source: target) {
@@ -763,42 +800,36 @@ public class KBestExtractor {
return visitor;
}
-
- private String getWordAlignmentString() {
+
+ public String getWordAlignment() {
return visit(new WordAlignmentExtractor()).toString();
}
- private List<List<Integer>> getWordAlignment() {
- WordAlignmentExtractor extractor = new WordAlignmentExtractor();
- visit(extractor);
- return extractor.getFinalWordAlignments();
+ public List<List<Integer>> getWordAlignmentList() {
+ final WordAlignmentExtractor visitor = new WordAlignmentExtractor();
+ visit(visitor);
+ return visitor.getFinalWordAlignments();
}
- private String getTree() {
+ public String getTree() {
return visit(new TreeExtractor()).toString();
}
- private String getHypothesis() {
+ public String getHypothesis() {
return getHypothesis(defaultSide);
}
- /**
- * For stack decoding we keep using the old string-based
- * HypothesisExtractor.
- * For Hiero, we use a faster, int-based hypothesis extraction
- * that is correct also for Side.SOURCE cases.
- */
private String getHypothesis(final Side side) {
return visit(new OutputStringExtractor(side.equals(Side.SOURCE))).toString();
}
- private FeatureVector getFeatures() {
+ public FeatureVector getFeatures() {
final FeatureVectorExtractor extractor = new FeatureVectorExtractor(featureFunctions, sentence);
visit(extractor);
return extractor.getFeatures();
}
- private String getDerivation() {
+ public String getDerivation() {
return visit(new DerivationExtractor()).toString();
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/src/joshua/decoder/hypergraph/WordAlignmentState.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentState.java b/src/joshua/decoder/hypergraph/WordAlignmentState.java
index 258e062..3430c5d 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentState.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentState.java
@@ -18,6 +18,8 @@
*/
package joshua.decoder.hypergraph;
+import static java.lang.Integer.MAX_VALUE;
+
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
@@ -40,8 +42,8 @@ public class WordAlignmentState {
* rule. The values of the elements correspond to the aligned source token on
* the source side of the rule.
*/
- private LinkedList<AlignedSourceTokens> trgPoints;
- private int srcStart;
+ private List<AlignedSourceTokens> trgPoints;
+ private final int srcStart;
/** number of NTs we need to substitute. */
private int numNT;
/** grows with substitutions of child rules. Reaches original Rule span if substitutions are complete */
@@ -51,17 +53,17 @@ public class WordAlignmentState {
* construct AlignmentState object from a virgin Rule and its source span.
* Determines if state is complete (if no NT present)
*/
- WordAlignmentState(Rule rule, int start) {
+ public WordAlignmentState(final Rule rule, final int start) {
trgPoints = new LinkedList<AlignedSourceTokens>();
srcLength = rule.getFrench().length;
numNT = rule.getArity();
srcStart = start;
- Map<Integer, List<Integer>> alignmentMap = rule.getAlignmentMap();
- int[] nonTermPositions = rule.getNonTerminalSourcePositions();
- int[] trg = rule.getEnglish();
+ final Map<Integer, List<Integer>> alignmentMap = rule.getAlignmentMap();
+ final int[] nonTerminalSourcePositions = rule.getNonTerminalSourcePositions();
+ final int[] trg = rule.getEnglish();
// for each target index, create a TargetAlignmentPoint
for (int trgIndex = 0; trgIndex < trg.length; trgIndex++) {
- AlignedSourceTokens trgPoint = new AlignedSourceTokens();
+ final AlignedSourceTokens trgPoint = new AlignedSourceTokens();
if (trg[trgIndex] >= 0) { // this is a terminal symbol, check for alignment
if (alignmentMap.containsKey(trgIndex)) {
@@ -72,9 +74,10 @@ public class WordAlignmentState {
} else { // this target word is NULL-aligned
trgPoint.setNull();
}
- } else { // this is a nonterminal ([X]) [actually its the (negative) index of the NT in the source
- trgPoint.setNonTerminal();
- trgPoint.add(srcStart + nonTermPositions[Math.abs(trg[trgIndex]) - 1]);
+ } else { // this is a nonterminal ([X]) [actually its the (negative) index of the NT in the source]
+ trgPoint.setNonTerminal(); // mark as non-terminal
+ final int absoluteNonTerminalSourcePosition = srcStart + nonTerminalSourcePositions[Math.abs(trg[trgIndex]) - 1];
+ trgPoint.add(absoluteNonTerminalSourcePosition);
}
trgPoints.add(trgPoint);
}
@@ -93,17 +96,18 @@ public class WordAlignmentState {
* trg. Sorted by trg indexes. Disregards the sentence markers.
*/
public String toFinalString() {
- StringBuilder sb = new StringBuilder();
+ final StringBuilder sb = new StringBuilder();
int t = 0;
for (AlignedSourceTokens pt : trgPoints) {
- for (int s : pt)
- sb.append(String.format(" %d-%d", s-1, t-1)); // disregard sentence
- // markers
+ for (int s : pt) {
+ sb.append(String.format(" %d-%d", s-1, t-1)); // disregard sentence markers
+ }
t++;
}
- String result = sb.toString();
- if (!result.isEmpty())
+ final String result = sb.toString();
+ if (!result.isEmpty()) {
return result.substring(1);
+ }
return result;
}
@@ -113,18 +117,19 @@ public class WordAlignmentState {
* First and last item in trgPoints is skipped.
*/
public List<List<Integer>> toFinalList() {
- assert (isComplete() == true);
- List<List<Integer>> alignment = new ArrayList<List<Integer>> ();
- if (trgPoints.isEmpty())
+ final List<List<Integer>> alignment = new ArrayList<List<Integer>>(trgPoints.size());
+ if (trgPoints.isEmpty()) {
return alignment;
- ListIterator<AlignedSourceTokens> it = trgPoints.listIterator();
+ }
+ final ListIterator<AlignedSourceTokens> it = trgPoints.listIterator();
it.next(); // skip first item (sentence marker)
while (it.hasNext()) {
- AlignedSourceTokens alignedSourceTokens = it.next();
+ final AlignedSourceTokens alignedSourceTokens = it.next();
if (it.hasNext()) { // if not last element in trgPoints
- List<Integer> newAlignedSourceTokens = new ArrayList<Integer>();
- for (Integer sourceIndex : alignedSourceTokens)
+ final List<Integer> newAlignedSourceTokens = new ArrayList<Integer>();
+ for (Integer sourceIndex : alignedSourceTokens) {
newAlignedSourceTokens.add(sourceIndex - 1); // shift by one to disregard sentence marker
+ }
alignment.add(newAlignedSourceTokens);
}
}
@@ -134,38 +139,46 @@ public class WordAlignmentState {
/**
* String representation for debugging.
*/
+ @Override
public String toString() {
return String.format("%s , len=%d start=%d, isComplete=%s",
trgPoints.toString(), srcLength, srcStart, this.isComplete());
}
/**
- * substitutes a child WorldAlignmentState into this instance at the first
- * NT it finds. Also shifts the indeces in this instance by the span/width of the
+ * Substitutes a child WordAlignmentState into this instance at the next
+ * nonterminal slot. Also shifts the indices in this instance by the span/width of the
* child that is to be substituted.
* Substitution order is determined by the source-first traversal through the hypergraph.
*/
- void substituteIn(WordAlignmentState child) {
- // update existing indexes by length of child (has no effect on NULL and
- // NonTerminal points)
- for (AlignedSourceTokens trgPoint : trgPoints)
+ public void substituteIn(WordAlignmentState child) {
+ // find the index of the NonTerminal where we substitute the child targetPoints into.
+ // The correct NT is the first one on the SOURCE side.
+ // Also shift all trgPoints by the child length.
+ int substitutionIndex = 0;
+ int sourcePosition = MAX_VALUE;
+ for (final ListIterator<AlignedSourceTokens> trgPointsIterator = trgPoints.listIterator(); trgPointsIterator.hasNext();) {
+ final AlignedSourceTokens trgPoint = trgPointsIterator.next();
trgPoint.shiftBy(child.srcStart, child.srcLength - 1);
-
- // now substitute in the child at first NT, modifying the list
- ListIterator<AlignedSourceTokens> it = trgPoints.listIterator();
- while (it.hasNext()) {
- AlignedSourceTokens trgPoint = it.next();
- if (trgPoint.isNonTerminal()) { // found first NT
- it.remove(); // remove NT symbol
- for (AlignedSourceTokens childElement : child.trgPoints) {
- childElement.setFinal(); // child source indexes are final, do not change them anymore
- it.add(childElement);
- }
- this.srcLength += child.srcLength - 1; // -1 (NT)
- this.numNT--;
- break;
+ if (trgPoint.isNonTerminal() && trgPoint.get(0) < sourcePosition) {
+ sourcePosition = trgPoint.get(0);
+ substitutionIndex = trgPointsIterator.previousIndex();
}
}
+
+ // point to and remove the NT element determined above
+ final ListIterator<AlignedSourceTokens> insertionIterator = trgPoints.listIterator(substitutionIndex);
+ insertionIterator.next();
+ insertionIterator.remove();
+
+ // insert child target points and set them to final.
+ for (AlignedSourceTokens childElement : child.trgPoints) {
+ childElement.setFinal();
+ insertionIterator.add(childElement);
+ }
+
+ // update length and number of non terminal slots
+ this.srcLength += child.srcLength - 1; // -1 (NT)
+ this.numNT--;
}
-
-}
\ No newline at end of file
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/src/joshua/decoder/io/JSONMessage.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/io/JSONMessage.java b/src/joshua/decoder/io/JSONMessage.java
index 2733db4..e373716 100644
--- a/src/joshua/decoder/io/JSONMessage.java
+++ b/src/joshua/decoder/io/JSONMessage.java
@@ -90,7 +90,7 @@ public class JSONMessage {
JSONMessage message = new JSONMessage();
String[] results = translation.toString().split("\\n");
if (results.length > 0) {
- JSONMessage.TranslationItem item = message.addTranslation(translation.getStructuredTranslation().getTranslationString());
+ JSONMessage.TranslationItem item = message.addTranslation(translation.getStructuredTranslations().get(0).getTranslationString());
for (String result: results) {
String[] tokens = result.split(" \\|\\|\\| ");
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/corpus/VocabularyTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/corpus/VocabularyTest.java b/tst/joshua/corpus/VocabularyTest.java
index 724d9c7..107f76f 100644
--- a/tst/joshua/corpus/VocabularyTest.java
+++ b/tst/joshua/corpus/VocabularyTest.java
@@ -1,11 +1,13 @@
// Copyright 2015 Amazon.com, Inc. or its affiliates. All Rights Reserved.
package joshua.corpus;
-import static org.junit.Assert.*;
+import static joshua.util.FormatUtils.isNonterminal;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.IOException;
-import java.util.Arrays;
import org.junit.After;
import org.junit.Before;
@@ -53,21 +55,21 @@ public class VocabularyTest {
@Test
public void givenVocabulary_whenCheckingStringInBracketsOrNegativeNumber_thenIsNonTerminal() {
//non-terminals
- assertTrue(Vocabulary.nt(NON_TERMINAL));
+ assertTrue(isNonterminal(NON_TERMINAL));
//terminals
- assertFalse(Vocabulary.nt(WORD1));
- assertFalse(Vocabulary.nt("[]"));
- assertFalse(Vocabulary.nt("["));
- assertFalse(Vocabulary.nt("]"));
- assertFalse(Vocabulary.nt(""));
+ assertFalse(isNonterminal(WORD1));
+ assertFalse(isNonterminal("[]"));
+ assertFalse(isNonterminal("["));
+ assertFalse(isNonterminal("]"));
+ assertFalse(isNonterminal(""));
//negative numbers indicate non-terminals
- assertTrue(Vocabulary.nt(-1));
- assertTrue(Vocabulary.nt(-5));
+ assertTrue(isNonterminal(-1));
+ assertTrue(isNonterminal(-5));
//positive numbers indicate terminals:
- assertFalse(Vocabulary.nt(0));
- assertFalse(Vocabulary.nt(5));
+ assertFalse(isNonterminal(0));
+ assertFalse(isNonterminal(5));
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java b/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java
index 26c503a..36b0bd3 100644
--- a/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java
+++ b/tst/joshua/decoder/kbest_extraction/KBestExtractionTest.java
@@ -18,11 +18,14 @@
*/
package joshua.decoder.kbest_extraction;
+import static com.google.common.base.Charsets.UTF_8;
+import static java.nio.file.Files.readAllBytes;
+import static org.junit.Assert.assertEquals;
+
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
-import joshua.corpus.Vocabulary;
import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.Translation;
@@ -32,11 +35,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import static com.google.common.base.Charsets.UTF_8;
-import static java.nio.file.Files.readAllBytes;
-import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
-import static org.junit.Assert.assertEquals;
-
/**
* Reimplements the kbest extraction regression test
* TODO (fhieber): this test strangely only works with StateMinimizing KenLM.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java b/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
index 6abfbe2..14a87be 100644
--- a/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
+++ b/tst/joshua/decoder/phrase/constrained/ConstrainedPhraseDecodingTest.java
@@ -18,11 +18,14 @@
*/
package joshua.decoder.phrase.constrained;
+import static com.google.common.base.Charsets.UTF_8;
+import static java.nio.file.Files.readAllBytes;
+import static org.junit.Assert.assertEquals;
+
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
-import joshua.corpus.Vocabulary;
import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.Translation;
@@ -32,11 +35,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import static com.google.common.base.Charsets.UTF_8;
-import static java.nio.file.Files.readAllBytes;
-import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
-import static org.junit.Assert.assertEquals;
-
/**
* Reimplements the constrained phrase decoding test
*/
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java b/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java
index 4785aff..621d80b 100644
--- a/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java
+++ b/tst/joshua/decoder/phrase/decode/PhraseDecodingTest.java
@@ -18,11 +18,14 @@
*/
package joshua.decoder.phrase.decode;
+import static com.google.common.base.Charsets.UTF_8;
+import static java.nio.file.Files.readAllBytes;
+import static org.junit.Assert.assertEquals;
+
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
-import joshua.corpus.Vocabulary;
import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.Translation;
@@ -32,11 +35,6 @@ import org.junit.After;
import org.junit.Before;
import org.junit.Test;
-import static com.google.common.base.Charsets.UTF_8;
-import static java.nio.file.Files.readAllBytes;
-import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
-import static org.junit.Assert.assertEquals;
-
/**
* Reimplements the constrained phrase decoding test
*/
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/KenLmTest.java b/tst/joshua/system/KenLmTest.java
index dba74fc..5529fa7 100644
--- a/tst/joshua/system/KenLmTest.java
+++ b/tst/joshua/system/KenLmTest.java
@@ -20,10 +20,10 @@
import static joshua.corpus.Vocabulary.registerLanguageModel;
import static joshua.corpus.Vocabulary.unregisterLanguageModels;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
-import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.ff.lm.KenLM;
import org.junit.After;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/system/MultithreadedTranslationTests.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/MultithreadedTranslationTests.java b/tst/joshua/system/MultithreadedTranslationTests.java
index b257aa6..f438ccd 100644
--- a/tst/joshua/system/MultithreadedTranslationTests.java
+++ b/tst/joshua/system/MultithreadedTranslationTests.java
@@ -20,7 +20,9 @@
import static org.junit.Assert.assertTrue;
+import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
+import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
@@ -29,7 +31,7 @@ import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.Translation;
import joshua.decoder.Translations;
-import joshua.decoder.io.TranslationRequest;
+import joshua.decoder.io.TranslationRequestStream;
import org.junit.After;
import org.junit.Before;
@@ -108,7 +110,7 @@ public class MultithreadedTranslationTests {
// GIVEN
int inputLines = 10000;
- joshuaConfig.construct_structured_output = true; // Enabled alignments.
+ joshuaConfig.use_structured_output = true; // Enabled alignments.
StringBuilder sb = new StringBuilder();
for (int i = 0; i < inputLines; i++) {
sb.append(INPUT + "\n");
@@ -116,8 +118,8 @@ public class MultithreadedTranslationTests {
// Append a large string together to simulate N requests to the decoding
// engine.
- TranslationRequest req = new TranslationRequest(new ByteArrayInputStream(sb.toString()
- .getBytes(Charset.forName("UTF-8"))), joshuaConfig);
+ TranslationRequestStream req = new TranslationRequestStream(new BufferedReader(new InputStreamReader(new ByteArrayInputStream(sb.toString()
+ .getBytes(Charset.forName("UTF-8"))))), joshuaConfig);
// WHEN
// Translate all spans in parallel.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/system/StructuredOutputTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/StructuredOutputTest.java b/tst/joshua/system/StructuredOutputTest.java
index 12e6e88..a1cdd82 100644
--- a/tst/joshua/system/StructuredOutputTest.java
+++ b/tst/joshua/system/StructuredOutputTest.java
@@ -63,8 +63,8 @@ public class StructuredOutputTest {
joshuaConfig.use_unique_nbest = false;
joshuaConfig.include_align_index = false;
joshuaConfig.topN = 0;
- joshuaConfig.tms.add("thrax pt 20 resources/wa_grammar");
- joshuaConfig.tms.add("thrax glue -1 resources/grammar.glue");
+ joshuaConfig.tms.add("thrax -owner pt -maxspan 20 -path resources/wa_grammar");
+ joshuaConfig.tms.add("thrax -owner glue -maxspan -1 -path resources/grammar.glue");
joshuaConfig.goal_symbol = "[GOAL]";
joshuaConfig.default_non_terminal = "[X]";
joshuaConfig.features.add("feature_function = OOVPenalty");
@@ -107,14 +107,14 @@ public class StructuredOutputTest {
joshuaConfig.use_structured_output = true; // set structured output creation to true
translation = decode(input);
Assert
- .assertEquals(expectedTranslation, translation.getTranslationString());
+ .assertEquals(expectedTranslation, translation.getStructuredTranslations().get(0).getTranslationString());
Assert.assertEquals(Arrays.asList(expectedTranslation.split("\\s+")),
- translation.getTranslationTokens());
- Assert.assertEquals(expectedScore, translation.getTranslationScore(),
+ translation.getStructuredTranslations().get(0).getTranslationTokens());
+ Assert.assertEquals(expectedScore, translation.getStructuredTranslations().get(0).getTranslationScore(),
0.00001);
- Assert.assertEquals(expectedWordAlignment, translation.getWordAlignment());
- Assert.assertEquals(translation.getWordAlignment().size(), translation
- .getTranslationTokens().size());
+ Assert.assertEquals(expectedWordAlignment, translation.getStructuredTranslations().get(0).getTranslationWordAlignments());
+ Assert.assertEquals(translation.getStructuredTranslations().get(0).getTranslationWordAlignments().size(), translation
+ .getStructuredTranslations().get(0).getTranslationTokens().size());
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/system/StructuredTranslationTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/StructuredTranslationTest.java b/tst/joshua/system/StructuredTranslationTest.java
index 7460614..0608a65 100644
--- a/tst/joshua/system/StructuredTranslationTest.java
+++ b/tst/joshua/system/StructuredTranslationTest.java
@@ -19,7 +19,6 @@
package joshua.system;
import static java.util.Arrays.asList;
-import static joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -27,12 +26,10 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
-import joshua.corpus.Vocabulary;
import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.StructuredTranslation;
import joshua.decoder.Translation;
-import joshua.decoder.ff.FeatureVector;
import joshua.decoder.segment_file.Sentence;
import org.junit.After;
@@ -63,6 +60,7 @@ public class StructuredTranslationTest {
asList(), asList(7));
private static final double EXPECTED_SCORE = -17.0;
private static final Map<String,Float> EXPECTED_FEATURES = new HashMap<>();
+ private static final int EXPECTED_NBEST_LIST_SIZE = 8;
static {
EXPECTED_FEATURES.put("tm_glue_0", 1.0f);
EXPECTED_FEATURES.put("tm_pt_0", -3.0f);
@@ -115,7 +113,7 @@ public class StructuredTranslationTest {
@Test
public void givenInput_whenRegularOutputFormat_thenExpectedOutput() {
// GIVEN
- joshuaConfig.construct_structured_output = false;
+ joshuaConfig.use_structured_output = false;
joshuaConfig.outputFormat = "%s | %a ";
// WHEN
@@ -128,7 +126,7 @@ public class StructuredTranslationTest {
@Test
public void givenInput_whenRegularOutputFormatWithTopN1_thenExpectedOutput() {
// GIVEN
- joshuaConfig.construct_structured_output = false;
+ joshuaConfig.use_structured_output = false;
joshuaConfig.outputFormat = "%s | %e | %a | %c";
joshuaConfig.topN = 1;
@@ -141,19 +139,48 @@ public class StructuredTranslationTest {
}
@Test
- public void givenInput_whenStructuredOutputFormat_thenExpectedOutput() {
+ public void givenInput_whenStructuredOutputFormatWithTopN0_thenExpectedOutput() {
// GIVEN
- joshuaConfig.construct_structured_output = true;
+ joshuaConfig.use_structured_output = true;
+ joshuaConfig.topN = 0;
+
+ // WHEN
+ final Translation translation = decode(INPUT);
+ final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
+ final String translationString = structuredTranslation.getTranslationString();
+ final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+ final float translationScore = structuredTranslation.getTranslationScore();
+ final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
+ final Map<String,Float> translationFeatures = structuredTranslation.getTranslationFeatures();
+
+ // THEN
+ assertTrue(translation.getStructuredTranslations().size() == 1);
+ assertEquals(EXPECTED_TRANSLATION, translationString);
+ assertEquals(EXPECTED_TRANSLATED_TOKENS, translatedTokens);
+ assertEquals(EXPECTED_SCORE, translationScore, 0.00001);
+ assertEquals(EXPECTED_WORD_ALIGNMENT, wordAlignment);
+ assertEquals(wordAlignment.size(), translatedTokens.size());
+ assertEquals(EXPECTED_FEATURES.entrySet(), translationFeatures.entrySet());
+ }
+
+ @Test
+ public void givenInput_whenStructuredOutputFormatWithTopN1_thenExpectedOutput() {
+ // GIVEN
+ joshuaConfig.use_structured_output = true;
+ joshuaConfig.topN = 1;
// WHEN
- final StructuredTranslation translation = decode(INPUT).getStructuredTranslation();
- final String translationString = translation.getTranslationString();
- final List<String> translatedTokens = translation.getTranslationTokens();
- final float translationScore = translation.getTranslationScore();
- final List<List<Integer>> wordAlignment = translation.getTranslationWordAlignments();
- final Map<String,Float> translationFeatures = translation.getTranslationFeatures();
+ final Translation translation = decode(INPUT);
+ final List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
+ final StructuredTranslation structuredTranslation = structuredTranslations.get(0);
+ final String translationString = structuredTranslation.getTranslationString();
+ final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+ final float translationScore = structuredTranslation.getTranslationScore();
+ final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
+ final Map<String,Float> translationFeatures = structuredTranslation.getTranslationFeatures();
// THEN
+ assertTrue(structuredTranslations.size() == 1);
assertEquals(EXPECTED_TRANSLATION, translationString);
assertEquals(EXPECTED_TRANSLATED_TOKENS, translatedTokens);
assertEquals(EXPECTED_SCORE, translationScore, 0.00001);
@@ -163,16 +190,43 @@ public class StructuredTranslationTest {
}
@Test
+ public void givenInput_whenStructuredOutputFormatWithKBest_thenExpectedOutput() {
+ // GIVEN
+ joshuaConfig.use_structured_output = true;
+ joshuaConfig.topN = 100;
+
+ // WHEN
+ final Translation translation = decode(INPUT);
+ final List<StructuredTranslation> structuredTranslations = translation.getStructuredTranslations();
+ final StructuredTranslation viterbiTranslation = structuredTranslations.get(0);
+ final StructuredTranslation lastKBest = structuredTranslations.get(structuredTranslations.size() - 1);
+
+ // THEN
+ assertEquals(structuredTranslations.size(), EXPECTED_NBEST_LIST_SIZE);
+ assertTrue(structuredTranslations.size() > 1);
+ assertEquals(EXPECTED_TRANSLATION, viterbiTranslation.getTranslationString());
+ assertEquals(EXPECTED_TRANSLATED_TOKENS, viterbiTranslation.getTranslationTokens());
+ assertEquals(EXPECTED_SCORE, viterbiTranslation.getTranslationScore(), 0.00001);
+ assertEquals(EXPECTED_WORD_ALIGNMENT, viterbiTranslation.getTranslationWordAlignments());
+ assertEquals(EXPECTED_FEATURES.entrySet(), viterbiTranslation.getTranslationFeatures().entrySet());
+ // last entry in KBEST is all input words untranslated, should have 8 OOVs.
+ assertEquals(INPUT, lastKBest.getTranslationString());
+ assertEquals(-800.0, lastKBest.getTranslationFeatures().get("OOVPenalty"), 0.0001);
+
+ }
+
+ @Test
public void givenEmptyInput_whenStructuredOutputFormat_thenEmptyOutput() {
// GIVEN
- joshuaConfig.construct_structured_output = true;
+ joshuaConfig.use_structured_output = true;
// WHEN
- final StructuredTranslation translation = decode("").getStructuredTranslation();
- final String translationString = translation.getTranslationString();
- final List<String> translatedTokens = translation.getTranslationTokens();
- final float translationScore = translation.getTranslationScore();
- final List<List<Integer>> wordAlignment = translation.getTranslationWordAlignments();
+ final Translation translation = decode("");
+ final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
+ final String translationString = structuredTranslation.getTranslationString();
+ final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+ final float translationScore = structuredTranslation.getTranslationScore();
+ final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
// THEN
assertEquals("", translationString);
@@ -184,15 +238,16 @@ public class StructuredTranslationTest {
@Test
public void givenOOVInput_whenStructuredOutputFormat_thenOOVOutput() {
// GIVEN
- joshuaConfig.construct_structured_output = true;
+ joshuaConfig.use_structured_output = true;
final String input = "gabarbl";
// WHEN
- final StructuredTranslation translation = decode(input).getStructuredTranslation();
- final String translationString = translation.getTranslationString();
- final List<String> translatedTokens = translation.getTranslationTokens();
- final float translationScore = translation.getTranslationScore();
- final List<List<Integer>> wordAlignment = translation.getTranslationWordAlignments();
+ final Translation translation = decode(input);
+ final StructuredTranslation structuredTranslation = translation.getStructuredTranslations().get(0);
+ final String translationString = structuredTranslation.getTranslationString();
+ final List<String> translatedTokens = structuredTranslation.getTranslationTokens();
+ final float translationScore = structuredTranslation.getTranslationScore();
+ final List<List<Integer>> wordAlignment = structuredTranslation.getTranslationWordAlignments();
// THEN
assertEquals(input, translationString);
@@ -204,7 +259,7 @@ public class StructuredTranslationTest {
@Test
public void givenEmptyInput_whenRegularOutputFormat_thenNewlineOutput() {
// GIVEN
- joshuaConfig.construct_structured_output = false;
+ joshuaConfig.use_structured_output = false;
// WHEN
final Translation translation = decode("");
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e3673e98/tst/joshua/util/FormatUtilsTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/util/FormatUtilsTest.java b/tst/joshua/util/FormatUtilsTest.java
index 254522d..a1edc33 100644
--- a/tst/joshua/util/FormatUtilsTest.java
+++ b/tst/joshua/util/FormatUtilsTest.java
@@ -19,9 +19,9 @@
package joshua.util;
import static joshua.util.FormatUtils.cleanNonTerminal;
+import static joshua.util.FormatUtils.ensureNonTerminalBrackets;
import static joshua.util.FormatUtils.escapeSpecialSymbols;
import static joshua.util.FormatUtils.isNonterminal;
-import static joshua.util.FormatUtils.markup;
import static joshua.util.FormatUtils.stripNonTerminalIndex;
import static joshua.util.FormatUtils.unescapeSpecialSymbols;
import static org.junit.Assert.*;
@@ -58,11 +58,7 @@ public class FormatUtilsTest {
@Test
public void givenTokens_whenMarkup_thenCorrectMarkup() {
- assertEquals(markup("X"), "[X]");
- assertEquals(markup("X", 1), "[X,1]");
- assertEquals(markup("X", 15), "[X,15]");
- assertEquals(markup("[X]", 1), "[X,1]");
- assertEquals(markup("[X,1]", 4), "[X,4]");
+ assertEquals(ensureNonTerminalBrackets("X"), "[X]");
}
@Test