You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/29 06:47:04 UTC
[02/10] incubator-joshua git commit: Modified KenLM JNI to support
querying the LM using strings, not only ids. Also added a method to check
whether a word or id is known to the LM. Made output of regression tests more
concise.
Modified KenLM JNI to support querying the LM using strings, not only ids. Also added a method to check whether a word or id is known to the LM.
Made output of regression tests more concise.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/cc556006
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/cc556006
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/cc556006
Branch: refs/heads/master
Commit: cc556006e646677f5c0e32755547b2657f59ff92
Parents: d25464c
Author: Felix Hieber <fh...@amazon.com>
Authored: Thu Dec 31 11:57:18 2015 +0100
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Thu Apr 28 15:42:05 2016 -0700
----------------------------------------------------------------------
jni/kenlm_wrap.cc | 57 ++++++++++++++++++++++++++++++--
src/joshua/decoder/ff/lm/KenLM.java | 35 +++++++++++++++++++-
tst/joshua/system/KenLmTest.java | 55 ++++++++++++++++++++++--------
3 files changed, 130 insertions(+), 17 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cc556006/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index 6d66f37..16cb54b 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -12,7 +12,7 @@
#include <jni.h>
#include <pthread.h>
-// Grr. Everybody's compiler is slightly different and I'm trying to not depend on boost.
+// Grr. Everybody's compiler is slightly different and I'm trying to not depend on boost.
#include <unordered_map>
// Verify that jint and lm::ngram::WordIndex are the same size. If this breaks
@@ -55,7 +55,7 @@ struct Chart {
lm::ngram::ChartState* put(const lm::ngram::ChartState& state) {
uint64_t hashValue = lm::ngram::hash_value(state);
-
+
if (poolHash->find(hashValue) == poolHash->end()) {
lm::ngram::ChartState* pointer = (lm::ngram::ChartState *)pool->Allocate(sizeof(lm::ngram::ChartState));
*pointer = state;
@@ -67,7 +67,7 @@ struct Chart {
};
// Vocab ids above what the vocabulary knows about are unknown and should
-// be mapped to that.
+// be mapped to that.
void MapArray(const std::vector<lm::WordIndex>& map, jint *begin, jint *end) {
for (jint *i = begin; i < end; ++i) {
*i = map[*i];
@@ -88,8 +88,17 @@ public:
virtual ~VirtualBase() {
}
+ // Compute/return n-gram probability for array of Joshua word ids
virtual float Prob(jint *begin, jint *end) const = 0;
+ // Compute/return n-gram probability for array of lm::WordIndex values
+ virtual float ProbForWordIndexArray(jint *begin, jint *end) const = 0;
+
+ // Returns the internal lm::WordIndex for a string
+ virtual uint GetLmId(const StringPiece& word) const = 0;
+
+ virtual bool IsKnownWordIndex(const lm::WordIndex& id) const = 0;
+
virtual float ProbRule(jlong *begin, jlong *end, lm::ngram::ChartState& state) const = 0;
virtual float ProbString(jint * const begin, jint * const end,
@@ -131,8 +140,12 @@ public:
}
float Prob(jint * const begin, jint * const end) const {
+ // map Joshua word ids to lm::WordIndexes
MapArray(map_, begin, end);
+ return ProbForWordIndexArray(begin, end);
+ }
+ float ProbForWordIndexArray(jint * const begin, jint * const end) const {
std::reverse(begin, end - 1);
lm::ngram::State ignored;
return m_.FullScoreForgotState(
@@ -141,6 +154,14 @@ public:
ignored).prob;
}
+ uint GetLmId(const StringPiece& word) const {
+ return m_.GetVocabulary().Index(word);
+ }
+
+ bool IsKnownWordIndex(const lm::WordIndex& id) const {
+ return id != m_.GetVocabulary().NotFound();
+ }
+
float ProbRule(jlong * const begin, jlong * const end, lm::ngram::ChartState& state) const {
if (begin == end) return 0.0;
lm::ngram::RuleScore<Model> ruleScore(m_, state);
@@ -353,6 +374,36 @@ JNIEXPORT jfloat JNICALL Java_joshua_decoder_ff_lm_KenLM_prob(
values + length);
}
+JNIEXPORT jfloat JNICALL Java_joshua_decoder_ff_lm_KenLM_probForString(
+ JNIEnv *env, jclass, jlong pointer, jobjectArray arr) {
+ jint length = env->GetArrayLength(arr);
+ if (length <= 0)
+ return 0.0;
+ jint values[length];
+ const VirtualBase* lm_base = reinterpret_cast<const VirtualBase*>(pointer);
+ for (int i=0; i<length; i++) {
+ jstring word = (jstring) env->GetObjectArrayElement(arr, i);
+ const char *str = env->GetStringUTFChars(word, 0);
+ values[i] = lm_base->GetLmId(str);
+ env->ReleaseStringUTFChars(word, str);
+ }
+ return lm_base->ProbForWordIndexArray(values,
+ values + length);
+}
+
+JNIEXPORT jboolean JNICALL Java_joshua_decoder_ff_lm_KenLM_isKnownWord(
+ JNIEnv *env, jclass, jlong pointer, jstring word) {
+ const char *str = env->GetStringUTFChars(word, 0);
+ if (!str)
+ return false;
+ bool ret;
+ const VirtualBase* lm_base = reinterpret_cast<const VirtualBase*>(pointer);
+ lm::WordIndex id = lm_base->GetLmId(str);
+ ret = lm_base->IsKnownWordIndex(id);
+ env->ReleaseStringUTFChars(word, str);
+ return ret;
+}
+
JNIEXPORT jfloat JNICALL Java_joshua_decoder_ff_lm_KenLM_probString(
JNIEnv *env, jclass, jlong pointer, jintArray arr, jint start) {
jint length = env->GetArrayLength(arr);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cc556006/src/joshua/decoder/ff/lm/KenLM.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/KenLM.java b/src/joshua/decoder/ff/lm/KenLM.java
index be1f1e2..329b631 100644
--- a/src/joshua/decoder/ff/lm/KenLM.java
+++ b/src/joshua/decoder/ff/lm/KenLM.java
@@ -18,6 +18,7 @@
*/
package joshua.decoder.ff.lm;
+import joshua.corpus.Vocabulary;
import joshua.decoder.ff.lm.NGramLanguageModel;
import joshua.decoder.ff.state_maintenance.KenLMState;
@@ -66,6 +67,10 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
private final static native float prob(long ptr, int words[]);
+ private final static native float probForString(long ptr, String[] words);
+
+ private final static native boolean isKnownWord(long ptr, String word);
+
private final static native StateProbPair probRule(long ptr, long pool, long words[]);
private final static native float estimateRule(long ptr, long words[]);
@@ -82,6 +87,16 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
N = order(pointer);
}
+ /**
+ * Constructor if order is not known.
+ * Order will be inferred from the model.
+ */
+ public KenLM(String file_name) {
+ pointer = construct(file_name);
+ N = order(pointer);
+ ngramOrder = N;
+ }
+
public void destroy() {
destroy(pointer);
}
@@ -94,10 +109,17 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
return registerWord(pointer, word, id);
}
- public float prob(int words[]) {
+ public float prob(int[] words) {
return prob(pointer, words);
}
+ /**
+ * Query for n-gram probability using strings.
+ */
+ public float prob(String[] words) {
+ return probForString(pointer, words);
+ }
+
// Apparently Zhifei starts some array indices at 1. Change to 0-indexing.
public float probString(int words[], int start) {
return probString(pointer, words, start - 1);
@@ -146,6 +168,17 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
return estimate;
}
+ /**
+ * The start symbol for a KenLM is the Vocabulary.START_SYM.
+ */
+ public String getStartSymbol() {
+ return Vocabulary.START_SYM;
+ }
+
+ public boolean isKnownWord(String word) {
+ return isKnownWord(pointer, word);
+ }
+
/**
* Inner class used to hold the results returned from KenLM with left-state minimization. Note
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cc556006/tst/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/tst/joshua/system/KenLmTest.java b/tst/joshua/system/KenLmTest.java
index e185aaf..dba74fc 100644
--- a/tst/joshua/system/KenLmTest.java
+++ b/tst/joshua/system/KenLmTest.java
@@ -18,7 +18,9 @@
*/
package joshua.system;
-import static org.junit.Assert.assertEquals;
+import static joshua.corpus.Vocabulary.registerLanguageModel;
+import static joshua.corpus.Vocabulary.unregisterLanguageModels;
+import static org.junit.Assert.*;
import joshua.corpus.Vocabulary;
import joshua.decoder.Decoder;
import joshua.decoder.JoshuaConfiguration;
@@ -29,22 +31,23 @@ import org.junit.Before;
import org.junit.Test;
/**
- * Integration test for KenLM integration into Joshua This test will setup a
- * Joshua instance that loads libkenlm.so
- *
- * @author kellens
+ * KenLM JNI interface tests.
+ * Loads libkenlm.{so,dylib}.
+ * If run in Eclipse, add -Djava.library.path=build/lib to JVM arguments
+ * of the run configuration.
*/
public class KenLmTest {
+ private static final String LANGUAGE_MODEL_PATH = "resources/kenlm/oilers.kenlm";
+
@Test
- public void givenKenLmUsed_whenTranslationsCalled_thenVerifyJniWithSampleCall() {
+ public void givenKenLm_whenQueryingForNgramProbability_thenProbIsCorrect() {
// GIVEN
- String languageModelPath = "resources/kenlm/oilers.kenlm";
+ KenLM kenLm = new KenLM(3, LANGUAGE_MODEL_PATH);
+ int[] words = Vocabulary.addAll("Wayne Gretzky");
+ registerLanguageModel(kenLm);
// WHEN
- KenLM kenLm = new KenLM(3, languageModelPath);
- Vocabulary.registerLanguageModel(kenLm);
- int[] words = Vocabulary.addAll("Wayne Gretzky");
float probability = kenLm.prob(words);
// THEN
@@ -52,15 +55,41 @@ public class KenLmTest {
Float.MIN_VALUE);
}
+ @Test
+ public void givenKenLm_whenQueryingForNgramProbability_thenIdAndStringMethodsReturnTheSame() {
+ // GIVEN
+ KenLM kenLm = new KenLM(LANGUAGE_MODEL_PATH);
+ registerLanguageModel(kenLm);
+ String sentence = "Wayne Gretzky";
+ String[] words = sentence.split("\\s+");
+ int[] ids = Vocabulary.addAll(sentence);
+
+ // WHEN
+ float prob_string = kenLm.prob(words);
+ float prob_id = kenLm.prob(ids);
+
+ // THEN
+ assertEquals("ngram probabilities differ for word and id based n-gram query", prob_string, prob_id,
+ Float.MIN_VALUE);
+
+ }
+
+ @Test
+ public void givenKenLm_whenIsKnownWord_thenReturnValuesAreCorrect() {
+ KenLM kenLm = new KenLM(LANGUAGE_MODEL_PATH);
+ assertTrue(kenLm.isKnownWord("Wayne"));
+ assertFalse(kenLm.isKnownWord("Wayne2222"));
+ }
+
@Before
public void setUp() throws Exception {
Vocabulary.clear();
- Vocabulary.unregisterLanguageModels();
+ unregisterLanguageModels();
}
-
+
@After
public void tearDown() throws Exception {
Vocabulary.clear();
- Vocabulary.unregisterLanguageModels();
+ unregisterLanguageModels();
}
}