You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/26 23:55:39 UTC
[3/9] incubator-joshua git commit: Properly clean static decoder variables on cleanUp(). Allows multiple unit tests with joshua instances. Also some cleanup and proper testing of FormatUtils functions.

Properly clean up static decoder variables in cleanUp(). This allows multiple unit tests to use Joshua instances. Also includes some cleanup and proper testing of the FormatUtils functions.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b2ec94fb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b2ec94fb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b2ec94fb

Branch: refs/heads/master
Commit: b2ec94fbcc6c0207f9830f32e4d8747c5be02c30
Parents: b9b512e
Author: Felix Hieber <fh...@amazon.com>
Authored: Thu Nov 19 19:32:59 2015 +0530
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Mon Apr 25 19:46:32 2016 -0700

----------------------------------------------------------------------
 src/joshua/corpus/Vocabulary.java               | 10 +++-
 src/joshua/decoder/Decoder.java                 | 15 ++++-
 src/joshua/decoder/DecoderThread.java           |  2 +-
 src/joshua/decoder/ff/lm/LanguageModelFF.java   | 22 ++++---
 .../ff/lm/StateMinimizingLanguageModel.java     |  2 -
 src/joshua/decoder/segment_file/Sentence.java   | 19 ++++++-
 src/joshua/util/FormatUtils.java                | 32 +++++++++++
 tst/joshua/util/FormatUtilsTest.java            | 60 ++++++++++++++++++++
 8 files changed, 145 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/src/joshua/corpus/Vocabulary.java
----------------------------------------------------------------------
diff --git a/src/joshua/corpus/Vocabulary.java b/src/joshua/corpus/Vocabulary.java
index e598203..1792219 100644
--- a/src/joshua/corpus/Vocabulary.java
+++ b/src/joshua/corpus/Vocabulary.java
@@ -46,7 +46,7 @@ import joshua.util.FormatUtils;
 
 public class Vocabulary {
 
-  private final static ArrayList<NGramLanguageModel> lms = new ArrayList<NGramLanguageModel>();
+  private final static ArrayList<NGramLanguageModel> LMs = new ArrayList<NGramLanguageModel>();
 
   private static List<String> idToString;
   private static Map<String, Integer> stringToId;
@@ -68,7 +68,7 @@ public class Vocabulary {
   public static boolean registerLanguageModel(NGramLanguageModel lm) {
     synchronized (lock) {
       // Store the language model.
-      lms.add(lm);
+      LMs.add(lm);
       // Notify it of all the existing words.
       boolean collision = false;
       for (int i = idToString.size() - 1; i > 0; i--)
@@ -141,7 +141,7 @@ public class Vocabulary {
         // register this (token,id) mapping with each language
         // model, so that they can map it to their own private
         // vocabularies
-        for (NGramLanguageModel lm : lms)
+        for (NGramLanguageModel lm : LMs)
           lm.registerWord(token, Math.abs(id));
 
         idToString.add(token);
@@ -269,4 +269,8 @@ public class Vocabulary {
     }
   }
   
+  public static void unregisterLanguageModels() {
+    LMs.clear();
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 739ee41..1b12dda 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -41,6 +41,8 @@ import joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
 import joshua.decoder.JoshuaConfiguration.SERVER_TYPE;
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.PhraseModel;
+import joshua.decoder.ff.StatefulFF;
+import joshua.decoder.ff.lm.LanguageModelFF;
 import joshua.decoder.ff.tm.Grammar;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.Trie;
@@ -514,15 +516,26 @@ public class Decoder {
     return null;
   }
 
+  /**
+   * Cleanly shuts down the Decoder and resets all
+   * static variables, so that any Decoder instance
+   * created afterwards gets a fresh start.
+   */
   public void cleanUp() {
+    // shut down DecoderThreads
     for (DecoderThread thread : threadPool) {
       try {
         thread.join();
       } catch (InterruptedException e) {
-        // TODO Auto-generated catch block
         e.printStackTrace();
       }
     }
+    // clear/reset static variables
+    DENSE_FEATURE_NAMES.clear();
+    Vocabulary.clear();
+    Vocabulary.unregisterLanguageModels();
+    LanguageModelFF.resetLmIndex();
+    StatefulFF.resetGlobalStateIndex();
   }
 
   public static void writeConfigFile(double[] newWeights, String template, String outputFile,

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/src/joshua/decoder/DecoderThread.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/DecoderThread.java b/src/joshua/decoder/DecoderThread.java
index 6bc507f..4e2a15c 100644
--- a/src/joshua/decoder/DecoderThread.java
+++ b/src/joshua/decoder/DecoderThread.java
@@ -146,7 +146,7 @@ public class DecoderThread extends Thread {
 
     float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
     Decoder.LOG(1, String.format("Input %d: Translation took %.3f seconds", sentence.id(), seconds));
-    Decoder.LOG(1, String.format("Memory used after sentence %d is %.1f MB", sentence.id(), (Runtime
+    Decoder.LOG(1, String.format("Input %d: Memory used is %.1f MB", sentence.id(), (Runtime
         .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0));
 
     /* Return the translation unless we're doing synchronous parsing. */

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/src/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
index 18c149d..38f1a74 100644
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ b/src/joshua/decoder/ff/lm/LanguageModelFF.java
@@ -54,9 +54,8 @@ import joshua.decoder.segment_file.Sentence;
  */
 public class LanguageModelFF extends StatefulFF {
 
-  private static int LM_INDEX = 0;
-  public static int START_SYM_ID;
-  public static int STOP_SYM_ID;
+  public static int LM_INDEX = 0;
+  private int startSymbolId;
 
   /**
    * N-gram language model. We assume the language model is in ARPA format for equivalent state:
@@ -164,7 +163,7 @@ public class LanguageModelFF extends StatefulFF {
    * @param type
    * @param path
    */
-  public void initializeLM() {
+  protected void initializeLM() {
     if (type.equals("kenlm")) {
       this.languageModel = new KenLM(ngramOrder, path);
     
@@ -180,8 +179,7 @@ public class LanguageModelFF extends StatefulFF {
     Vocabulary.registerLanguageModel(this.languageModel);
     Vocabulary.id(config.default_non_terminal);
     
-    LanguageModelFF.START_SYM_ID = Vocabulary.id(Vocabulary.START_SYM);
-    LanguageModelFF.STOP_SYM_ID = Vocabulary.id(Vocabulary.STOP_SYM);
+    startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
   }
 
   public NGramLanguageModel getLM() {
@@ -310,7 +308,7 @@ public class LanguageModelFF extends StatefulFF {
     int[] enWords = rule.getEnglish();
 
     List<Integer> words = new ArrayList<Integer>();
-    boolean skipStart = (enWords[0] == START_SYM_ID);
+    boolean skipStart = (enWords[0] == startSymbolId);
 
     /*
      * Move through the words, accumulating language model costs each time we have an n-gram (n >=
@@ -349,7 +347,7 @@ public class LanguageModelFF extends StatefulFF {
 
       boolean considerIncompleteNgrams = true;
       boolean skipStart = true;
-      if (words.get(0) != START_SYM_ID) {
+      if (words.get(0) != startSymbolId) {
         skipStart = false;
       }
       estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
@@ -507,4 +505,12 @@ public class LanguageModelFF extends StatefulFF {
 
     return score;
   }
+  
+  /**
+   * Resets LM_INDEX back to 0.
+   * Required if multiple instances of the Joshua Decoder live in the same JVM.
+   */
+  public static void resetLmIndex() {
+    LM_INDEX = 0;
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java b/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
index 8f2b8a3..5e406de 100644
--- a/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ b/src/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
@@ -80,8 +80,6 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
     Vocabulary.registerLanguageModel(this.languageModel);
     Vocabulary.id(config.default_non_terminal);
     
-    LanguageModelFF.START_SYM_ID = Vocabulary.id(Vocabulary.START_SYM);
-    LanguageModelFF.STOP_SYM_ID = Vocabulary.id(Vocabulary.STOP_SYM);
   }
   
   /**

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
index b51d509..970b387 100644
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ b/src/joshua/decoder/segment_file/Sentence.java
@@ -18,6 +18,9 @@
  */
 package joshua.decoder.segment_file;
 
+import static joshua.util.FormatUtils.addSentenceMarkers;
+import static joshua.util.FormatUtils.escapeSpecialSymbols;
+
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.Iterator;
@@ -56,7 +59,10 @@ public class Sentence {
    * alignment or forced decoding.
    */
   protected String source = null;
+  protected String fullSource = null;
+  
   protected String target = null;
+  protected String fullTarget = null;
   protected String[] references = null;
 
   /* Lattice representation of the source sentence. */
@@ -106,6 +112,9 @@ public class Sentence {
       }
       this.id = id;
     }
+    
+    // Escape special symbols ('[', ']', '|') that cause problems for the decoder
+    source = escapeSpecialSymbols(source);
   
     // Only trim strings
     if (joshuaConfiguration.lattice_decoding && ! source.startsWith("((("))
@@ -311,7 +320,10 @@ public class Sentence {
    * @return String The input sentence with start and stop symbols
    */
   public String fullSource() {
-    return String.format("%s %s %s", Vocabulary.START_SYM , source(), Vocabulary.STOP_SYM); 
+    if (fullSource == null) {
+      fullSource = addSentenceMarkers(source());
+    }
+    return fullSource;  
   }
 
   /**
@@ -329,7 +341,10 @@ public class Sentence {
   }
 
   public String fullTarget() {
-    return String.format("<s> %s </s>", target());
+    if (fullTarget == null) {
+      fullTarget = addSentenceMarkers(target());
+    }
+    return fullTarget; 
   }
 
   public String source(int i, int j) {

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/src/joshua/util/FormatUtils.java
----------------------------------------------------------------------
diff --git a/src/joshua/util/FormatUtils.java b/src/joshua/util/FormatUtils.java
index c196328..67b2bf3 100644
--- a/src/joshua/util/FormatUtils.java
+++ b/src/joshua/util/FormatUtils.java
@@ -22,6 +22,8 @@ import java.io.PrintStream;
 import java.io.UnsupportedEncodingException;
 import java.util.regex.Pattern;
 
+import joshua.corpus.Vocabulary;
+
 /**
  * Utility class for format issues.
  * 
@@ -94,8 +96,38 @@ public class FormatUtils {
   }
 
   public static String markup(String nt, int index) {
+    if (isNonterminal(nt)) {
+      return markup(cleanNonTerminal(nt), index);
+    }
     return "[" + nt + INDEX_SEPARATOR + index + "]";
   }
+  
+  public static String escapeSpecialSymbols(String s) {
+    return s.replaceAll("\\[",  "-lsb-")
+            .replaceAll("\\]",  "-rsb-")
+            .replaceAll("\\|",  "-pipe-");
+  }
+  
+  public static String unescapeSpecialSymbols(String s) {
+    return s.replaceAll("-lsb-", "[")
+            .replaceAll("-rsb-", "]")
+            .replaceAll("-pipe-", "|");
+  }
+  
+  /**
+   * Wraps the sentence in the start/stop markers
+   * defined by Vocabulary, each separated by a single space.
+   */
+  public static String addSentenceMarkers(String s) {
+    return Vocabulary.START_SYM + " " + s + " " + Vocabulary.STOP_SYM;
+  }
+  
+  /**
+   * Strips the sentence markers "<s> " and " </s>" (with their adjoining space) from the string.
+   */
+  public static String removeSentenceMarkers(String s) {
+    return s.replaceAll("<s> ", "").replace(" </s>", "");
+  }
 
   /**
    * Returns true if the String parameter represents a valid number.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2ec94fb/tst/joshua/util/FormatUtilsTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/util/FormatUtilsTest.java b/tst/joshua/util/FormatUtilsTest.java
new file mode 100644
index 0000000..da406cb
--- /dev/null
+++ b/tst/joshua/util/FormatUtilsTest.java
@@ -0,0 +1,60 @@
+package joshua.util;
+
+import static joshua.util.FormatUtils.cleanNonTerminal;
+import static joshua.util.FormatUtils.escapeSpecialSymbols;
+import static joshua.util.FormatUtils.isNonterminal;
+import static joshua.util.FormatUtils.markup;
+import static joshua.util.FormatUtils.stripNonTerminalIndex;
+import static joshua.util.FormatUtils.unescapeSpecialSymbols;
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+public class FormatUtilsTest {
+  
+  @Test
+  public void givenTokens_whenIsNonTerminal_thenTokensCorrectlyClassified() {
+    assertTrue(isNonterminal("[X]"));
+    assertTrue(isNonterminal("[X,1]"));
+    assertFalse(isNonterminal("[]"));
+    assertFalse(isNonterminal("[X)"));
+  }
+  
+  @Test
+  public void givenTokens_whenCleanNonTerminal_thenCorrectlyCleaned() {
+    assertEquals(cleanNonTerminal("[GOAL]"), "GOAL");
+    assertEquals(cleanNonTerminal("[X]"), "X");
+    assertEquals(cleanNonTerminal("[X,1]"), "X");
+    assertEquals(cleanNonTerminal("bla"), "bla");
+    assertEquals(cleanNonTerminal("[bla"), "[bla");
+  }
+  
+  @Test
+  public void givenTokens_whenStripNonTerminalIndex_thenCorrectlyStripped() {
+    assertEquals(stripNonTerminalIndex("[X,1]"), "[X]");
+    assertEquals(stripNonTerminalIndex("[X,114]"), "[X]");
+    assertEquals(stripNonTerminalIndex("[X,]"), "[X]");
+    assertEquals(stripNonTerminalIndex("[X]"), "[X]");
+    assertEquals(stripNonTerminalIndex("[X"), "[[X]");
+  }
+  
+  @Test
+  public void givenTokens_whenMarkup_thenCorrectMarkup() {
+    assertEquals(markup("X"), "[X]");
+    assertEquals(markup("X", 1), "[X,1]");
+    assertEquals(markup("X", 15), "[X,15]");
+    assertEquals(markup("[X]", 1), "[X,1]");
+    assertEquals(markup("[X,1]", 4), "[X,4]");
+  }
+  
+  @Test
+  public void givenSpecialSymbols_whenEscapeSpecialSymbols_thenCorrectlyEscaped() {
+    assertEquals(escapeSpecialSymbols("[ ] | ["), "-lsb- -rsb- -pipe- -lsb-");
+  }
+  
+  @Test
+  public void givenEscapedSpecialSymbols_whenUnEscapeSpecialSymbols_thenCorrectlyUnEscaped() {
+    assertEquals(unescapeSpecialSymbols("-lsb- -rsb- -pipe- -lsb-"), "[ ] | [");
+  }
+
+}