You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/26 23:55:44 UTC

[8/9] incubator-joshua git commit: fixed "source annotations" test case by moving character rewriting

fixed "source annotations" test case by moving character rewriting

Source-side annotations can be added to input tokens by appending semicolon-delimited "key=value" pairs, enclosed in square brackets, directly after each token, e.g.,

    i[POS=PRO] want[POS=VB;TENSE=pr;person=sg] to go...

So the replacement of square brackets has to happen *after* the annotations are parsed (in Token.java).


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b82027d0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b82027d0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b82027d0

Branch: refs/heads/master
Commit: b82027d09cc950a366b032c2d61ac6ade85bff0a
Parents: 7fdc4cd
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Apr 26 17:54:16 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Apr 26 17:54:16 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/segment_file/Sentence.java | 3 ---
 src/joshua/decoder/segment_file/Token.java    | 9 +++++----
 2 files changed, 5 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b82027d0/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index 970b387..08ecf89 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -113,9 +113,6 @@ public class Sentence {
       this.id = id;
     }
     
-    // Mask strings that cause problems for the decoder
-    source = escapeSpecialSymbols(source);
-  
     // Only trim strings
     if (joshuaConfiguration.lattice_decoding && ! source.startsWith("((("))
       adjustForLength(joshuaConfiguration.maxlen);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b82027d0/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Token.java b/src/joshua/decoder/segment_file/Token.java
index ebe9a43..bddfd68 100644
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@ -18,6 +18,8 @@
  */
 package joshua.decoder.segment_file;
 
+import static joshua.util.FormatUtils.escapeSpecialSymbols;
+
 import java.util.HashMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -90,10 +92,9 @@ public class Token {
       token = rawWord;
     }
 
-    // Mask strings that cause problems for the decoder
-    token = token.replaceAll("\\[",  "-lsb-")
-        .replaceAll("\\]",  "-rsb-")
-        .replaceAll("\\|",  "-pipe-");
+    // Mask strings that cause problems for the decoder. This has to be done *after* parsing for
+    // annotations.
+    token = escapeSpecialSymbols(token);
 
     if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
       if (FormatUtils.ISALLUPPERCASE(token))