You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/29 06:47:11 UTC

[09/10] incubator-joshua git commit: removed non-functioning include_align_index code, added test case in case it's restored

removed non-functioning include_align_index code, added test case in case it's restored


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/a6bf6283
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/a6bf6283
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/a6bf6283

Branch: refs/heads/master
Commit: a6bf6283aedfd4b3538bc36cd9264c9b610281ff
Parents: 73523c5
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Apr 29 00:45:09 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Apr 29 00:45:09 2016 -0400

----------------------------------------------------------------------
 .../decoder/hypergraph/KBestExtractor.java      |  25 +++-------
 test/decoder/phrase/include-align-index/README  |   2 +
 test/decoder/phrase/include-align-index/config  |  29 +++++++++++
 .../phrase/include-align-index/corpus.es        |   1 +
 test/decoder/phrase/include-align-index/lm.1.gz | Bin 0 -> 2235 bytes
 test/decoder/phrase/include-align-index/log     |  50 +++++++++++++++++++
 test/decoder/phrase/include-align-index/output  |   1 +
 .../phrase/include-align-index/output.gold      |   1 +
 .../phrase/include-align-index/rules.1.gz       | Bin 0 -> 2998042 bytes
 test/decoder/phrase/include-align-index/test.sh |  17 +++++++
 10 files changed, 108 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index a41ee66..98f6f15 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -166,18 +166,19 @@ public class KBestExtractor {
   public String getKthHyp(HGNode node, int k) {
 
     String outputString = null;
-
+    
     // Determine the k-best hypotheses at each HGNode
     VirtualNode virtualNode = getVirtualNode(node);
     DerivationState derivationState = virtualNode.lazyKBestExtractOnNode(this, k);
 //    DerivationState derivationState = getKthDerivation(node, k);
     if (derivationState != null) {
       // ==== read the kbest from each hgnode and convert to output format
-      String hypothesis = formatForOutput(
+      String hypothesis = maybeProjectCase(
                             unescapeSpecialSymbols(
                               removeSentenceMarkers(
                                 derivationState.getHypothesis())), derivationState);
       
+      
       /*
        * To save space, the decoder only stores the model cost,
        * not the individual feature values.
@@ -190,8 +191,8 @@ public class KBestExtractor {
 
       outputString = outputFormat
           .replace("%k", Integer.toString(k))
-          .replace("%s", formatForOutput(hypothesis, derivationState))
-          .replace("%S", DeNormalize.processSingleLine(formatForOutput(hypothesis, derivationState)))
+          .replace("%s", hypothesis)
+          .replace("%S", DeNormalize.processSingleLine(hypothesis))
           // TODO (kellens): Fix the recapitalization here
           .replace("%i", Integer.toString(sentence.id()))
           .replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString())
@@ -230,9 +231,9 @@ public class KBestExtractor {
    * @param state
    * @return
    */
-  private String formatForOutput(String hypothesis, DerivationState state) {
+  private String maybeProjectCase(String hypothesis, DerivationState state) {
     String output = hypothesis;
-    
+
     if (joshuaConfiguration.project_case) {
       String[] tokens = hypothesis.split("\\s+");
       List<List<Integer>> points = state.getWordAlignment();
@@ -253,18 +254,6 @@ public class KBestExtractor {
       output = String.join(" ",  tokens);
     }
 
-    if (joshuaConfiguration.include_align_index) {
-      String[] tokens = hypothesis.split("\\s+");
-      List<List<Integer>> points = state.getWordAlignment();
-      for (int i = 0; i < tokens.length; i++) {
-        if (i < points.size()) {
-          tokens[i] += String.format(" %d-%d",  points.get(i).get(0), 
-              points.get(i).get(points.get(i).size()-1));
-        }
-      }
-      output = String.join(" ",  tokens);
-    }
-
     return output;
   }
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/README
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/README b/test/decoder/phrase/include-align-index/README
new file mode 100644
index 0000000..d0c0813
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/README
@@ -0,0 +1,2 @@
+Added non-functioning example that will test outputting phrase alignments if
+that ability is ever restored.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/config
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/config b/test/decoder/phrase/include-align-index/config
new file mode 100644
index 0000000..f30014d
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/config
@@ -0,0 +1,29 @@
+tm = moses -owner pt -maxspan 0 -path rules.1.gz -max-source-len 5
+feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
+
+search = stack
+
+mark-oovs = false
+pop-limit = 10
+top-n = 1
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = true
+reordering-limit = 6
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/corpus.es
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/corpus.es b/test/decoder/phrase/include-align-index/corpus.es
new file mode 100644
index 0000000..6e255f9
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/corpus.es
@@ -0,0 +1 @@
+una estrategia republicana para obstaculizar la reelección de Obama 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/lm.1.gz
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/lm.1.gz b/test/decoder/phrase/include-align-index/lm.1.gz
new file mode 100644
index 0000000..3f4c453
Binary files /dev/null and b/test/decoder/phrase/include-align-index/lm.1.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/log
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/log b/test/decoder/phrase/include-align-index/log
new file mode 100644
index 0000000..05cd80f
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/log
@@ -0,0 +1,50 @@
+Parameters read from configuration file:
+    tm = 'moses -owner pt -maxspan 0 -path rules.1.gz -max-source-len 5'
+    featurefunction = 'StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz'
+    search = 'stack'
+    markoovs = 'false'
+    poplimit = '10'
+    topn = '1'
+    outputformat = '%i ||| %s ||| %f ||| %c'
+    includealignindex = 'true'
+    reorderinglimit = '6'
+    featurefunction = 'OOVPenalty'
+    featurefunction = 'WordPenalty'
+    featurefunction = 'Distortion'
+    featurefunction = 'PhrasePenalty -owner pt'
+Parameters overridden from the command line:
+    threads = '1'
+    c = 'config'
+Read 9 weights (0 of them dense)
+Reading grammar from file rules.1.gz...
+........10........20........30........40........50........60........70........80........90.....100%
+MemoryBasedBatchGrammar: Read 165161 rules with 18 distinct source sides from 'rules.1.gz'
+Couldn't create a GrammarReader for file null with format phrase
+MemoryBasedBatchGrammar: Read 0 rules with 0 distinct source sides from 'null'
+Memory used 219.6 MB
+Grammar loading took: 0 seconds.
+Stateful object with state index 0
+Loading the LM will be faster if you build a binary file.
+Reading lm.1.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+FEATURE: tm_pt (weight 0.000)
+FEATURE: tm_custom (weight 0.000)
+FEATURE: lm_0, order 5 (weight 0.204)
+FEATURE: OOVPenalty (weight 1.000)
+FEATURE: WordPenalty (weight -0.202)
+FEATURE: Distortion (weight 0.115)
+FEATURE: PhrasePenalty (weight -0.237)
+Grammar sorting happening lazily on-demand.
+Model loading took 0 seconds
+Memory used 219.6 MB
+Input 0: <s> una estrategia republicana para obstaculizar la reelección de Obama </s>
+Input 0: Collecting options took 0.000 seconds
+Input 0: Search took 0.013 seconds
+Input 0: Translation took 1.532 seconds
+Input 0: Memory used is 392.5 MB
+Translation 0: -7.496 a strategy republican to hinder reelection Obama 
+Input 0: 1-best extraction took 0.026 seconds
+Decoding completed.
+Memory used 401.6 MB
+Total running time: 2 seconds

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/output
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/output b/test/decoder/phrase/include-align-index/output
new file mode 100644
index 0000000..509a3de
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/output
@@ -0,0 +1 @@
+0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/output.gold b/test/decoder/phrase/include-align-index/output.gold
new file mode 100644
index 0000000..509a3de
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/output.gold
@@ -0,0 +1 @@
+0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/rules.1.gz
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/rules.1.gz b/test/decoder/phrase/include-align-index/rules.1.gz
new file mode 100644
index 0000000..14466e9
Binary files /dev/null and b/test/decoder/phrase/include-align-index/rules.1.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/a6bf6283/test/decoder/phrase/include-align-index/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/include-align-index/test.sh b/test/decoder/phrase/include-align-index/test.sh
new file mode 100644
index 0000000..4732f73
--- /dev/null
+++ b/test/decoder/phrase/include-align-index/test.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -u
+
+cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c config > output 2> log
+
+# Compare
+diff -u output output.gold > diff
+
+if [ $? -eq 0 ]; then
+  rm -f diff output log
+  exit 0
+else
+  exit 1
+fi
+
+