Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/29 06:47:04 UTC

[02/10] incubator-joshua git commit: Modified the KenLM JNI wrapper to support querying the LM using strings, not only ids. Also added a method to check whether a word or id is known to the LM. Made the output of the regression tests more concise.

Modified the KenLM JNI wrapper to support querying the LM using strings, not only ids. Also added a method to check whether a word or id is known to the LM.
Made the output of the regression tests more concise.
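
As a usage sketch (not part of this commit): the Java-side API added here can be exercised roughly as follows, mirroring the new tests in tst/joshua/system/KenLmTest.java further down. It assumes the test model resources/kenlm/oilers.kenlm and the native library on java.library.path, just as the tests do; the class name is purely illustrative.

    import joshua.corpus.Vocabulary;
    import joshua.decoder.ff.lm.KenLM;

    // Illustrative example class, not part of the Joshua codebase.
    public class KenLmStringQueryExample {
      public static void main(String[] args) {
        // The new single-argument constructor infers the n-gram order from the model.
        KenLM kenLm = new KenLM("resources/kenlm/oilers.kenlm");
        Vocabulary.registerLanguageModel(kenLm);

        // Query by surface strings (new) and by Joshua word ids (existing);
        // both paths should return the same probability.
        String sentence = "Wayne Gretzky";
        float probFromStrings = kenLm.prob(sentence.split("\\s+"));
        float probFromIds = kenLm.prob(Vocabulary.addAll(sentence));
        System.out.println(probFromStrings + " == " + probFromIds);

        // The new vocabulary check exposed through the JNI wrapper.
        System.out.println(kenLm.isKnownWord("Wayne"));     // expected: true
        System.out.println(kenLm.isKnownWord("Wayne2222")); // expected: false

        kenLm.destroy();
      }
    }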


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/cc556006
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/cc556006
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/cc556006

Branch: refs/heads/master
Commit: cc556006e646677f5c0e32755547b2657f59ff92
Parents: d25464c
Author: Felix Hieber <fh...@amazon.com>
Authored: Thu Dec 31 11:57:18 2015 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Thu Apr 28 15:42:05 2016 -0700

----------------------------------------------------------------------
 jni/kenlm_wrap.cc                   | 57 ++++++++++++++++++++++++++++++--
 src/joshua/decoder/ff/lm/KenLM.java | 35 +++++++++++++++++++-
 tst/joshua/system/KenLmTest.java    | 55 ++++++++++++++++++++++--------
 3 files changed, 130 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cc556006/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index 6d66f37..16cb54b 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -12,7 +12,7 @@
 #include <jni.h>
 #include <pthread.h>
 
-// Grr.  Everybody's compiler is slightly different and I'm trying to not depend on boost.   
+// Grr.  Everybody's compiler is slightly different and I'm trying to not depend on boost.
 #include <unordered_map>
 
 // Verify that jint and lm::ngram::WordIndex are the same size. If this breaks
@@ -55,7 +55,7 @@ struct Chart {
 
   lm::ngram::ChartState* put(const lm::ngram::ChartState& state) {
     uint64_t hashValue = lm::ngram::hash_value(state);
-  
+
     if (poolHash->find(hashValue) == poolHash->end()) {
       lm::ngram::ChartState* pointer = (lm::ngram::ChartState *)pool->Allocate(sizeof(lm::ngram::ChartState));
       *pointer = state;
@@ -67,7 +67,7 @@ struct Chart {
 };
 
 // Vocab ids above what the vocabulary knows about are unknown and should
-// be mapped to that. 
+// be mapped to that.
 void MapArray(const std::vector<lm::WordIndex>& map, jint *begin, jint *end) {
   for (jint *i = begin; i < end; ++i) {
     *i = map[*i];
@@ -88,8 +88,17 @@ public:
   virtual ~VirtualBase() {
   }
 
+  // compute/return n-gram probability for array of Joshua word ids
   virtual float Prob(jint *begin, jint *end) const = 0;
 
+  // Compute/return n-gram probability for an array of lm::WordIndexes
+  virtual float ProbForWordIndexArray(jint *begin, jint *end) const = 0;
+
+  // Returns the internal lm::WordIndex for a string
+  virtual uint GetLmId(const StringPiece& word) const = 0;
+
+  virtual bool IsKnownWordIndex(const lm::WordIndex& id) const = 0;
+
   virtual float ProbRule(jlong *begin, jlong *end, lm::ngram::ChartState& state) const = 0;
 
   virtual float ProbString(jint * const begin, jint * const end,
@@ -131,8 +140,12 @@ public:
   }
 
   float Prob(jint * const begin, jint * const end) const {
+    // map Joshua word ids to lm::WordIndexes
     MapArray(map_, begin, end);
+    return ProbForWordIndexArray(begin, end);
+  }
 
+  float ProbForWordIndexArray(jint * const begin, jint * const end) const {
     std::reverse(begin, end - 1);
     lm::ngram::State ignored;
     return m_.FullScoreForgotState(
@@ -141,6 +154,14 @@ public:
         ignored).prob;
   }
 
+  uint GetLmId(const StringPiece& word) const {
+    return m_.GetVocabulary().Index(word);
+  }
+
+  bool IsKnownWordIndex(const lm::WordIndex& id) const {
+      return id != m_.GetVocabulary().NotFound();
+  }
+
   float ProbRule(jlong * const begin, jlong * const end, lm::ngram::ChartState& state) const {
     if (begin == end) return 0.0;
     lm::ngram::RuleScore<Model> ruleScore(m_, state);
@@ -353,6 +374,36 @@ JNIEXPORT jfloat JNICALL Java_joshua_decoder_ff_lm_KenLM_prob(
       values + length);
 }
 
+JNIEXPORT jfloat JNICALL Java_joshua_decoder_ff_lm_KenLM_probForString(
+    JNIEnv *env, jclass, jlong pointer, jobjectArray arr) {
+  jint length = env->GetArrayLength(arr);
+  if (length <= 0)
+    return 0.0;
+  jint values[length];
+  const VirtualBase* lm_base = reinterpret_cast<const VirtualBase*>(pointer);
+  for (int i=0; i<length; i++) {
+      jstring word = (jstring) env->GetObjectArrayElement(arr, i);
+      const char *str = env->GetStringUTFChars(word, 0);
+      values[i] = lm_base->GetLmId(str);
+      env->ReleaseStringUTFChars(word, str);
+  }
+  return lm_base->ProbForWordIndexArray(values,
+      values + length);
+}
+
+JNIEXPORT jboolean JNICALL Java_joshua_decoder_ff_lm_KenLM_isKnownWord(
+    JNIEnv *env, jclass, jlong pointer, jstring word) {
+    const char *str = env->GetStringUTFChars(word, 0);
+    if (!str)
+      return false;
+    bool ret;
+    const VirtualBase* lm_base = reinterpret_cast<const VirtualBase*>(pointer);
+    lm::WordIndex id = lm_base->GetLmId(str);
+    ret = lm_base->IsKnownWordIndex(id);
+    env->ReleaseStringUTFChars(word, str);
+    return ret;
+}
+
 JNIEXPORT jfloat JNICALL Java_joshua_decoder_ff_lm_KenLM_probString(
     JNIEnv *env, jclass, jlong pointer, jintArray arr, jint start) {
   jint length = env->GetArrayLength(arr);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cc556006/src/joshua/decoder/ff/lm/KenLM.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/KenLM.java b/src/joshua/decoder/ff/lm/KenLM.java
index be1f1e2..329b631 100644
--- a/src/joshua/decoder/ff/lm/KenLM.java
+++ b/src/joshua/decoder/ff/lm/KenLM.java
@@ -18,6 +18,7 @@
  */
 package joshua.decoder.ff.lm;
 
+import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.lm.NGramLanguageModel;
 import joshua.decoder.ff.state_maintenance.KenLMState;
 
@@ -66,6 +67,10 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
 
   private final static native float prob(long ptr, int words[]);
 
+  private final static native float probForString(long ptr, String[] words);
+
+  private final static native boolean isKnownWord(long ptr, String word);
+
   private final static native StateProbPair probRule(long ptr, long pool, long words[]);
   
   private final static native float estimateRule(long ptr, long words[]);
@@ -82,6 +87,16 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
     N = order(pointer);
   }
 
+  /**
+   * Constructor if order is not known.
+   * Order will be inferred from the model.
+   */
+  public KenLM(String file_name) {
+    pointer = construct(file_name);
+    N = order(pointer);
+    ngramOrder = N;
+  }
+
   public void destroy() {
     destroy(pointer);
   }
@@ -94,10 +109,17 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
     return registerWord(pointer, word, id);
   }
 
-  public float prob(int words[]) {
+  public float prob(int[] words) {
     return prob(pointer, words);
   }
 
+  /**
+   * Query for n-gram probability using strings.
+   */
+  public float prob(String[] words) {
+    return probForString(pointer, words);
+  }
+
   // Apparently Zhifei starts some array indices at 1. Change to 0-indexing.
   public float probString(int words[], int start) {
     return probString(pointer, words, start - 1);
@@ -146,6 +168,17 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
     return estimate;
   }
 
+  /**
+   * The start symbol for a KenLM is the Vocabulary.START_SYM.
+   */
+  public String getStartSymbol() {
+    return Vocabulary.START_SYM;
+  }
+
+  public boolean isKnownWord(String word) {
+    return isKnownWord(pointer, word);
+  }
+
 
   /**
    * Inner class used to hold the results returned from KenLM with left-state minimization. Note

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cc556006/tst/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/KenLmTest.java b/tst/joshua/system/KenLmTest.java
index e185aaf..dba74fc 100644
--- a/tst/joshua/system/KenLmTest.java
+++ b/tst/joshua/system/KenLmTest.java
@@ -18,7 +18,9 @@
  */
  package joshua.system;
 
-import static org.junit.Assert.assertEquals;
+import static joshua.corpus.Vocabulary.registerLanguageModel;
+import static joshua.corpus.Vocabulary.unregisterLanguageModels;
+import static org.junit.Assert.*;
 import joshua.corpus.Vocabulary;
 import joshua.decoder.Decoder;
 import joshua.decoder.JoshuaConfiguration;
@@ -29,22 +31,23 @@ import org.junit.Before;
 import org.junit.Test;
 
 /**
- * Integration test for KenLM integration into Joshua This test will setup a
- * Joshua instance that loads libkenlm.so
- *
- * @author kellens
+ * KenLM JNI interface tests.
+ * Loads libken.{so,dylib}.
+ * If run in Eclipse, add -Djava.library.path=build/lib to JVM arguments
+ * of the run configuration.
  */
 public class KenLmTest {
 
+  private static final String LANGUAGE_MODEL_PATH = "resources/kenlm/oilers.kenlm";
+
   @Test
-  public void givenKenLmUsed_whenTranslationsCalled_thenVerifyJniWithSampleCall() {
+  public void givenKenLm_whenQueryingForNgramProbability_thenProbIsCorrect() {
     // GIVEN
-    String languageModelPath = "resources/kenlm/oilers.kenlm";
+    KenLM kenLm = new KenLM(3, LANGUAGE_MODEL_PATH);
+    int[] words = Vocabulary.addAll("Wayne Gretzky");
+    registerLanguageModel(kenLm);
 
     // WHEN
-    KenLM kenLm = new KenLM(3, languageModelPath);
-    Vocabulary.registerLanguageModel(kenLm);
-    int[] words = Vocabulary.addAll("Wayne Gretzky");
     float probability = kenLm.prob(words);
 
     // THEN
@@ -52,15 +55,41 @@ public class KenLmTest {
         Float.MIN_VALUE);
   }
   
+  @Test
+  public void givenKenLm_whenQueryingForNgramProbability_thenIdAndStringMethodsReturnTheSame() {
+    // GIVEN
+    KenLM kenLm = new KenLM(LANGUAGE_MODEL_PATH);
+    registerLanguageModel(kenLm);
+    String sentence = "Wayne Gretzky";
+    String[] words = sentence.split("\\s+");
+    int[] ids = Vocabulary.addAll(sentence);
+
+    // WHEN
+    float prob_string = kenLm.prob(words);
+    float prob_id = kenLm.prob(ids);
+
+    // THEN
+    assertEquals("ngram probabilities differ for word and id based n-gram query", prob_string, prob_id,
+            Float.MIN_VALUE);
+
+  }
+
+  @Test
+  public void givenKenLm_whenIsKnownWord_thenReturnValuesAreCorrect() {
+    KenLM kenLm = new KenLM(LANGUAGE_MODEL_PATH);
+    assertTrue(kenLm.isKnownWord("Wayne"));
+    assertFalse(kenLm.isKnownWord("Wayne2222"));
+  }
+
   @Before
   public void setUp() throws Exception {
     Vocabulary.clear();
-    Vocabulary.unregisterLanguageModels();
+    unregisterLanguageModels();
   }
-  
+
   @After
   public void tearDown() throws Exception {
     Vocabulary.clear();
-    Vocabulary.unregisterLanguageModels();
+    unregisterLanguageModels();
   }
 }