You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/05 23:04:15 UTC
incubator-joshua git commit: updating handling of source-side annotations

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 98ad70a26 -> 28e7c2293


updating handling of source-side annotations

Joshua now permits source-side annotations. Each input word can have an unlimited number of semicolon-delimited key[=value] annotations attached to it, listed in brackets immediately after the word. For example:

  je[pos=PRO] suis[pos=VB;tense=present;irregular] ...

This will result in annotations being attached to each word, which are then available to feature functions and elsewhere in the decoder. If an annotation has no value, it becomes its own value (e.g., "irregular" is stored as irregular -> irregular).

If -source-annotations is passed to the decoder, annotations of the name "class" will be used in place of the target-side word, allowing a language model to key on source-side context.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/28e7c229
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/28e7c229
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/28e7c229

Branch: refs/heads/master
Commit: 28e7c2293c7a337fff8d0190284ed677ab0964ee
Parents: 98ad70a
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Apr 5 17:03:51 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Apr 5 17:03:51 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/lm/LanguageModelFF.java |  6 +-
 src/joshua/decoder/segment_file/Sentence.java | 11 ++-
 src/joshua/decoder/segment_file/Token.java    | 80 +++++++++++++---------
 test/decoder/source-annotations/input.txt     |  2 +-
 test/decoder/source-annotations/test.sh       |  5 ++
 5 files changed, 62 insertions(+), 42 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/28e7c229/src/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 75eedef..869ddae 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -233,11 +233,11 @@ public class LanguageModelFF extends StatefulFF {
         if (tokens[i] > 0) { // skip nonterminals
           for (int j = 0; j < alignments.length; j += 2) {
             if (alignments[j] == i) {
-              int annotation = sentence.getAnnotation((int)alignments[i] + begin);
-              if (annotation != -1) {
+              String annotation = sentence.getAnnotation((int)alignments[i] + begin, "class");
+              if (annotation != null) {
 //                System.err.println(String.format("  word %d source %d abs %d annotation %d/%s", 
 //                    i, alignments[i], alignments[i] + begin, annotation, Vocabulary.word(annotation)));
-                tokens[i] = annotation;
+                tokens[i] = Vocabulary.id(annotation);
                 break;
               }
             }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/28e7c229/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index 72415ec..f65f2f9 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -88,11 +88,6 @@ public class Sentence {
       }
       this.id = id;
     }
-    
-    // Mask strings that cause problems for the decoder
-    source = source.replaceAll("\\[",  "-lsb-")
-        .replaceAll("\\]",  "-rsb-")
-        .replaceAll("\\|",  "-pipe-");
   
     // Only trim strings
     if (joshuaConfiguration.lattice_decoding && ! source.startsWith("((("))
@@ -127,11 +122,13 @@ public class Sentence {
   /**
    * Returns the annotations for a specific word (specified by an index) in the 
    * sentence
+   * 
    * @param index The location of the word in the sentence
+   * @param key The annotation identity
    * @return The annotations associated with this word
    */
-  public int getAnnotation(int index) {
-    return getTokens().get(index).getAnnotation();
+  public String getAnnotation(int index, String key) {
+    return getTokens().get(index).getAnnotation(key);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/28e7c229/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Token.java b/src/joshua/decoder/segment_file/Token.java
index 2d30a88..09fa0d6 100644
--- a/src/joshua/decoder/segment_file/Token.java
+++ b/src/joshua/decoder/segment_file/Token.java
@@ -1,54 +1,77 @@
 package joshua.decoder.segment_file;
 
+import java.util.HashMap;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import joshua.corpus.Vocabulary;
 
 /**
- * Stores the identity of a word and its annotations in a sentence
+ * Stores the identity of a word and its annotations in a sentence.
+
  * @author "Gaurav Kumar"
- *
+ * @author Matt Post
  */
 public class Token {
   // The token without the annotations
   private String token; 
   private int tokenID;
-  // The annotation extracted from the raw token 
-  private String type;
-  private int typeID;
+
+  private HashMap<String,String> annotations = null;
 
   /**
    * Constructor : Creates a Token object from a raw word
    * Extracts and assigns an annotation when available.
-   * The current convention for annotations is $TYPE_(TOKEN)
-   * For e.g., $num_(34) or $place_(Baltimore)
-   * Annotations can only be alphanumeric
-   * The annotation is set to -1 if there is no annotation for this token 
+   * Any word can be marked with annotations, which are arbitrary semicolon-delimited
+   * key[=value] pairs (the value is optional) listed in brackets after a word, e.g.,
+   * 
+   *    Je[ref=Samuel;PRO] voudrais[FUT;COND] ...
+   * 
+   * This will create a dictionary annotation on the word of the following form for "Je"
+   * 
+   *   ref -> Samuel
+   *   PRO -> PRO
+   *   
+   * and the following for "voudrais":
+   * 
+   *   FUT  -> FUT
+   *   COND -> COND
    * 
    * @param rawWord A word with annotation information (possibly)
    *  
    */
   public Token(String rawWord) {
+    
+    annotations = new HashMap<String,String>();
+    
     // Matches a word with an annotation
     // Check guidelines in constructor description
-    Pattern annotation = Pattern.compile("\\$(\\S+)_\\(([^)]+)\\)");
-    Matcher tag = annotation.matcher(rawWord);
+    Pattern pattern = Pattern.compile("(\\S+)\\[(\\S+)\\]");
+    Matcher tag = pattern.matcher(rawWord);
     if (tag.find()) {
       // Annotation match found
-      type = tag.group(1);
-      token = tag.group(2);
+      token = tag.group(1);
+      String tagStr = tag.group(2);
+
+      for (String annotation: tagStr.split(";")) {
+        int where = annotation.indexOf("=");
+        if (where != -1) {
+          annotations.put(annotation.substring(0, where), annotation.substring(where + 1));
+        } else {
+          annotations.put(annotation, annotation);
+        }
+      }
     } else {
-      // No match found, which implies that this token does not have an 
-      // associated annotation
+      // No match found, which implies that this token does not have any annotations 
       token = rawWord;
-      type = null;
     }
-    // Get the Vocabulary ID for the token and the tyoe
-    // The type string is also in the vocabulary since the LM
-    // needs an integer version of the type. 
+
+    // Mask strings that cause problems for the decoder
+    token = token.replaceAll("\\[",  "-lsb-")
+        .replaceAll("\\]",  "-rsb-")
+        .replaceAll("\\|",  "-pipe-");
+
     tokenID = Vocabulary.id(token);
-    typeID = type != null ? Vocabulary.id(type) : -1;
   }
 
   /**
@@ -73,16 +96,11 @@ public class Token {
    * associated with this token
    * @return int A type ID
    */
-  public int getAnnotation() {
-    return typeID;
-  }
-
-  /**
-   * Returns the string version of the annotation
-   * associated with this token
-   * @return String A type
-   */
-  public String getTypeIdentity() {
-    return type;
+  public String getAnnotation(String key) {
+    if (annotations.containsKey(key)) {
+      return annotations.get(key);
+    }
+    
+    return null;
   }
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/28e7c229/test/decoder/source-annotations/input.txt
----------------------------------------------------------------------
diff --git a/test/decoder/source-annotations/input.txt b/test/decoder/source-annotations/input.txt
index a4a06b0..f0a8a76 100644
--- a/test/decoder/source-annotations/input.txt
+++ b/test/decoder/source-annotations/input.txt
@@ -1 +1 @@
-$mattpost_(mis) amigos me llaman
+mis[tag=ADJ;num=PL;class=OOV] amigos me llaman

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/28e7c229/test/decoder/source-annotations/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/source-annotations/test.sh b/test/decoder/source-annotations/test.sh
index 1eb9950..49a00a0 100755
--- a/test/decoder/source-annotations/test.sh
+++ b/test/decoder/source-annotations/test.sh
@@ -1,5 +1,10 @@
 #!/bin/bash
 
+# Tests the language model code that uses the source-side projection instead of the word itself.
+# When translating a word, if there is a source-side annotation of the label "class", and
+# -source-annotations was added to the invocation, the LM will use that source-side class instead
+# of the translated word.
+
 set -u
 
 cat input.txt | $JOSHUA/bin/joshua-decoder -threads 1 -c joshua.config > output 2> log