You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/29 06:47:11 UTC
[09/10] incubator-joshua git commit: removed non-functioning include_align_index code, added test case in case it's restored
removed non-functioning include_align_index code, added test case in case it's restored
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/a6bf6283
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/a6bf6283
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/a6bf6283
Branch: refs/heads/master
Commit: a6bf6283aedfd4b3538bc36cd9264c9b610281ff
Parents: 73523c5
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 29 00:45:09 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 29 00:45:09 2016 -0400
----------------------------------------------------------------------
.../decoder/hypergraph/KBestExtractor.java | 25 +++-------
test/decoder/phrase/include-align-index/README | 2 +
test/decoder/phrase/include-align-index/config | 29 +++++++++++
.../phrase/include-align-index/corpus.es | 1 +
test/decoder/phrase/include-align-index/lm.1.gz | Bin 0 -> 2235 bytes
test/decoder/phrase/include-align-index/log | 50 +++++++++++++++++++
test/decoder/phrase/include-align-index/output | 1 +
.../phrase/include-align-index/output.gold | 1 +
.../phrase/include-align-index/rules.1.gz | Bin 0 -> 2998042 bytes
test/decoder/phrase/include-align-index/test.sh | 17 +++++++
10 files changed, 108 insertions(+), 18 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index a41ee66..98f6f15 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -166,18 +166,19 @@ public class KBestExtractor {
public String getKthHyp(HGNode node, int k) {
String outputString = null;
-
+
// Determine the k-best hypotheses at each HGNode
VirtualNode virtualNode = getVirtualNode(node);
DerivationState derivationState = virtualNode.lazyKBestExtractOnNode(this, k);
// DerivationState derivationState = getKthDerivation(node, k);
if (derivationState != null) {
// ==== read the kbest from each hgnode and convert to output format
- String hypothesis = formatForOutput(
+ String hypothesis = maybeProjectCase(
unescapeSpecialSymbols(
removeSentenceMarkers(
derivationState.getHypothesis())), derivationState);
+
/*
* To save space, the decoder only stores the model cost,
* no the individual feature values.
@@ -190,8 +191,8 @@ public class KBestExtractor {
outputString = outputFormat
.replace("%k", Integer.toString(k))
- .replace("%s", formatForOutput(hypothesis, derivationState))
- .replace("%S", DeNormalize.processSingleLine(formatForOutput(hypothesis, derivationState)))
+ .replace("%s", hypothesis)
+ .replace("%S", DeNormalize.processSingleLine(hypothesis))
// TODO (kellens): Fix the recapitalization here
.replace("%i", Integer.toString(sentence.id()))
.replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString())
@@ -230,9 +231,9 @@ public class KBestExtractor {
* @param state
* @return
*/
- private String formatForOutput(String hypothesis, DerivationState state) {
+ private String maybeProjectCase(String hypothesis, DerivationState state) {
String output = hypothesis;
-
+
if (joshuaConfiguration.project_case) {
String[] tokens = hypothesis.split("\\s+");
List<List<Integer>> points = state.getWordAlignment();
@@ -253,18 +254,6 @@ public class KBestExtractor {
output = String.join(" ", tokens);
}
- if (joshuaConfiguration.include_align_index) {
- String[] tokens = hypothesis.split("\\s+");
- List<List<Integer>> points = state.getWordAlignment();
- for (int i = 0; i < tokens.length; i++) {
- if (i < points.size()) {
- tokens[i] += String.format(" %d-%d", points.get(i).get(0),
- points.get(i).get(points.get(i).size()-1));
- }
- }
- output = String.join(" ", tokens);
- }
-
return output;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/README
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/README b/test/decoder/phrase/include-align-index/README
new file mode 100644
index 0000000..d0c0813
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/README
@@ -0,0 +1,2 @@
+Added non-functioning example that will test outputting phrase alignments if
+that ability is ever restored.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/config
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/config b/test/decoder/phrase/include-align-index/config
new file mode 100644
index 0000000..f30014d
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/config
@@ -0,0 +1,29 @@
+tm = moses -owner pt -maxspan 0 -path rules.1.gz -max-source-len 5
+feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
+
+search = stack
+
+mark-oovs = false
+pop-limit = 10
+top-n = 1
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = true
+reordering-limit = 6
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/corpus.es
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/corpus.es b/test/decoder/phrase/include-align-index/corpus.es
new file mode 100644
index 0000000..6e255f9
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/corpus.es
@@ -0,0 +1 @@
+una estrategia republicana para obstaculizar la reelección de Obama
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/lm.1.gz
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/lm.1.gz b/test/decoder/phrase/include-align-index/lm.1.gz
new file mode 100644
index 0000000..3f4c453
Binary files /dev/null and b/test/decoder/phrase/include-align-index/lm.1.gz differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/log
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/log b/test/decoder/phrase/include-align-index/log
new file mode 100644
index 0000000..05cd80f
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/log
@@ -0,0 +1,50 @@
+Parameters read from configuration file:
+ tm = 'moses -owner pt -maxspan 0 -path rules.1.gz -max-source-len 5'
+ featurefunction = 'StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz'
+ search = 'stack'
+ markoovs = 'false'
+ poplimit = '10'
+ topn = '1'
+ outputformat = '%i ||| %s ||| %f ||| %c'
+ includealignindex = 'true'
+ reorderinglimit = '6'
+ featurefunction = 'OOVPenalty'
+ featurefunction = 'WordPenalty'
+ featurefunction = 'Distortion'
+ featurefunction = 'PhrasePenalty -owner pt'
+Parameters overridden from the command line:
+ threads = '1'
+ c = 'config'
+Read 9 weights (0 of them dense)
+Reading grammar from file rules.1.gz...
+........10........20........30........40........50........60........70........80........90.....100%
+MemoryBasedBatchGrammar: Read 165161 rules with 18 distinct source sides from 'rules.1.gz'
+Couldn't create a GrammarReader for file null with format phrase
+MemoryBasedBatchGrammar: Read 0 rules with 0 distinct source sides from 'null'
+Memory used 219.6 MB
+Grammar loading took: 0 seconds.
+Stateful object with state index 0
+Loading the LM will be faster if you build a binary file.
+Reading lm.1.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+FEATURE: tm_pt (weight 0.000)
+FEATURE: tm_custom (weight 0.000)
+FEATURE: lm_0, order 5 (weight 0.204)
+FEATURE: OOVPenalty (weight 1.000)
+FEATURE: WordPenalty (weight -0.202)
+FEATURE: Distortion (weight 0.115)
+FEATURE: PhrasePenalty (weight -0.237)
+Grammar sorting happening lazily on-demand.
+Model loading took 0 seconds
+Memory used 219.6 MB
+Input 0: <s> una estrategia republicana para obstaculizar la reelección de Obama </s>
+Input 0: Collecting options took 0.000 seconds
+Input 0: Search took 0.013 seconds
+Input 0: Translation took 1.532 seconds
+Input 0: Memory used is 392.5 MB
+Translation 0: -7.496 a strategy republican to hinder reelection Obama
+Input 0: 1-best extraction took 0.026 seconds
+Decoding completed.
+Memory used 401.6 MB
+Total running time: 2 seconds
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/output
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/output b/test/decoder/phrase/include-align-index/output
new file mode 100644
index 0000000..509a3de
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/output
@@ -0,0 +1 @@
+0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/output.gold b/test/decoder/phrase/include-align-index/output.gold
new file mode 100644
index 0000000..509a3de
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/output.gold
@@ -0,0 +1 @@
+0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/rules.1.gz
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/rules.1.gz b/test/decoder/phrase/include-align-index/rules.1.gz
new file mode 100644
index 0000000..14466e9
Binary files /dev/null and b/test/decoder/phrase/include-align-index/rules.1.gz differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/test.sh b/test/decoder/phrase/include-align-index/test.sh
new file mode 100644
index 0000000..4732f73
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/test.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -u
+
+cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c config > output 2> log
+
+# Compare
+diff -u output output.gold > diff
+
+if [ $? -eq 0 ]; then
+ rm -f diff output log
+ exit 0
+else
+ exit 1
+fi
+
+