You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/13 15:48:50 UTC
[1/7] incubator-joshua git commit: Probably won't compile but gets
the idea across
Repository: incubator-joshua
Updated Branches:
refs/heads/master 49bbcac01 -> 90fff5ab1
Probably won't compile but gets the idea across
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/5e954752
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/5e954752
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/5e954752
Branch: refs/heads/master
Commit: 5e9547526ad4bc15f48e665608897def552cb9ab
Parents: c3e7a15
Author: Kenneth Heafield <gi...@kheafield.com>
Authored: Tue Sep 13 10:58:26 2016 +0200
Committer: Kenneth Heafield <gi...@kheafield.com>
Committed: Tue Sep 13 10:58:26 2016 +0200
----------------------------------------------------------------------
jni/kenlm_wrap.cc | 142 ++++++++++++++++++++++---------------------------
1 file changed, 63 insertions(+), 79 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5e954752/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index 11d9c28..8f69e19 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -20,7 +20,6 @@
#include "lm/left.hh"
#include "lm/state.hh"
#include "util/murmur_hash.hh"
-#include "util/pool.hh"
#include <iostream>
@@ -30,7 +29,8 @@
#include <pthread.h>
// Grr. Everybody's compiler is slightly different and I'm trying to not depend on boost.
-#include <unordered_map>
+#include <unordered_set>
+#include <vector>
// Verify that jint and lm::ngram::WordIndex are the same size. If this breaks
// for you, there's a need to revise probString.
@@ -45,7 +45,35 @@ template<> struct StaticCheck<true> {
typedef StaticCheck<sizeof(jint) == sizeof(lm::WordIndex)>::StaticAssertionPassed FloatSize;
-typedef std::unordered_multimap<uint64_t, lm::ngram::ChartState*> PoolHash;
+// Could be uint64_t if you wanted to have 33-bit support.
+typedef uint32_t StateIndex;
+typedef std::vector<lm::ngram::ChartState> StateVector;
+
+class HashIndex : public std::unary_function<StateIndex, uint64_t> {
+ public:
+ explicit HashIndex(const StateVector &vec) : vec_(vec) {}
+
+ uint64_t operator()(StateIndex index) const {
+ return hash_value(vec_[index]);
+ }
+
+ private:
+ const StateVector &vec_;
+};
+
+class EqualIndex : public std::binary_function<StateIndex, StateIndex, bool> {
+ public:
+ explicit EqualIndex(const StateVector &vec) : vec_(vec) {}
+
+ bool operator()(StateIndex first, StateIndex second) const {
+ return vec_[first] == vec_[second];
+ }
+
+ private:
+ const StateVector &vec_;
+};
+
+typedef std::unordered_set<StateIndex, HashIndex, EqualIndex> Lookup;
/**
* A Chart bundles together a unordered_multimap that maps ChartState signatures to a single
@@ -54,46 +82,26 @@ typedef std::unordered_multimap<uint64_t, lm::ngram::ChartState*> PoolHash;
* across KenLMs for the same sentence. Multimap is used to avoid hash collisions which can
* return incorrect results, and cause out-of-bounds lookups when multiple KenLMs are in use.
*/
-struct Chart {
- // A cache for allocated chart objects
- PoolHash* poolHash;
- // Pool used to allocate new ones
- util::Pool* pool;
-
- Chart() {
- poolHash = new PoolHash();
- pool = new util::Pool();
- }
-
- ~Chart() {
- delete poolHash;
- pool->FreeAll();
- delete pool;
- }
-
- lm::ngram::ChartState* put(const lm::ngram::ChartState& state) {
- lm::ngram::ChartState* state_ptr = nullptr;
- uint64_t hashValue = lm::ngram::hash_value(state);
- auto state_it = poolHash->find(hashValue);
-
- // Try to retrieve a matching ChartState pointer from our Pool
- while(state_it != poolHash->end()) {
- if (state == *(state_it->second)) {
- state_ptr = state_it->second;
- break;
+class Chart {
+ public:
+ Chart() : lookup_(1000, HashIndex(vec_), EqualIndex(vec_)) {}
+
+ StateIndex Intern(const lm::ngram::ChartState &state) {
+ vec_.push_back(state);
+ std::pair<Lookup::iterator, bool> ins(lookup_.insert(vec_.size() - 1));
+ if (!ins.second) {
+ vec_.pop_back();
}
- state_it++;
+ return *ins.first;
}
- // Unable to find this ChartState in our pool, allocate new space for it
- if (!state_ptr) {
- state_ptr = (lm::ngram::ChartState *) pool->Allocate(sizeof(lm::ngram::ChartState));
- *state_ptr = state;
- (*poolHash).insert({hashValue, state_ptr});
+ const ChartState &InterpretState(StateIndex index) const {
+ return vec_[index];
}
- return state_ptr;
- }
+ private:
+ StateVector vec_;
+ Lookup lookup_;
};
// Vocab ids above what the vocabulary knows about are unknown and should
@@ -131,7 +139,7 @@ public:
virtual bool IsKnownWordIndex(const lm::WordIndex& id) const = 0;
- virtual float ProbRule(jlong *begin, jlong *end, lm::ngram::ChartState& state) const = 0;
+ virtual float ProbRule(jlong *begin, jlong *end, lm::ngram::ChartState& state, const Chart &chart) const = 0;
virtual float ProbString(jint * const begin, jint * const end,
jint start) const = 0;
@@ -142,22 +150,9 @@ public:
virtual bool RegisterWord(const StringPiece& word, const int joshua_id) = 0;
- void RememberReturnMethod(jclass chart_pair, jmethodID chart_pair_init) {
- chart_pair_ = chart_pair;
- chart_pair_init_ = chart_pair_init;
- }
-
- jclass ChartPair() const { return chart_pair_; }
- jmethodID ChartPairInit() const { return chart_pair_init_; }
-
protected:
VirtualBase() {
}
-
-private:
- // Hack: these are remembered so we can avoid looking them up every time.
- jclass chart_pair_;
- jmethodID chart_pair_init_;
};
template<class Model> class VirtualImpl: public VirtualBase {
@@ -201,12 +196,12 @@ public:
return id != m_.GetVocabulary().NotFound();
}
- float ProbRule(jlong * const begin, jlong * const end, lm::ngram::ChartState& state) const {
+ float ProbRule(jlong * const begin, jlong * const end, lm::ngram::ChartState& state, const Chart &chart) const {
if (begin == end) return 0.0;
lm::ngram::RuleScore<Model> ruleScore(m_, state);
if (*begin < 0) {
- ruleScore.BeginNonTerminal(*reinterpret_cast<const lm::ngram::ChartState*>(-*begin));
+ ruleScore.BeginNonTerminal(chart.Interpet(-*begin));
} else {
const lm::WordIndex word = map_[*begin];
if (word == m_.GetVocabulary().BeginSentence()) {
@@ -218,7 +213,7 @@ public:
for (jlong* i = begin + 1; i != end; i++) {
long word = *i;
if (word < 0)
- ruleScore.NonTerminal(*reinterpret_cast<const lm::ngram::ChartState*>(-word));
+ ruleScore.NonTerminal(chart.Interpret(-word));
else
ruleScore.Terminal(map_[word]);
}
@@ -341,18 +336,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_construct(
VirtualBase *ret;
try {
ret = ConstructModel(str);
-
- // Get a class reference for the type pair that char
- jclass local_chart_pair = env->FindClass("org/apache/joshua/decoder/ff/lm/KenLM$StateProbPair");
- UTIL_THROW_IF(!local_chart_pair, util::Exception, "Failed to find org/apache/joshua/decoder/ff/lm/KenLM$StateProbPair");
- jclass chart_pair = (jclass)env->NewGlobalRef(local_chart_pair);
- env->DeleteLocalRef(local_chart_pair);
-
- // Get the Method ID of the constructor which takes an int
- jmethodID chart_pair_init = env->GetMethodID(chart_pair, "<init>", "(JF)V");
- UTIL_THROW_IF(!chart_pair_init, util::Exception, "Failed to find init method");
-
- ret->RememberReturnMethod(chart_pair, chart_pair_init);
} catch (std::exception &e) {
std::cerr << e.what() << std::endl;
abort();
@@ -363,20 +346,17 @@ JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_construct(
JNIEXPORT void JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_destroy(
JNIEnv *env, jclass, jlong pointer) {
- VirtualBase *base = reinterpret_cast<VirtualBase*>(pointer);
- env->DeleteGlobalRef(base->ChartPair());
- delete base;
+ delete reinterpret_cast<VirtualBase*>(pointer);
}
-JNIEXPORT long JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_createPool(
+JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_createPool(
JNIEnv *env, jclass) {
return reinterpret_cast<long>(new Chart());
}
JNIEXPORT void JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_destroyPool(
JNIEnv *env, jclass, jlong pointer) {
- Chart* chart = reinterpret_cast<Chart*>(pointer);
- delete chart;
+ delete reinterpret_cast<Chart*>(pointer);
}
JNIEXPORT jint JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_order(
@@ -462,7 +442,12 @@ JNIEXPORT jfloat JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probString(
values + length, start);
}
-JNIEXPORT jobject JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
+union FloatConverter {
+ float f;
+ uint32_t i;
+};
+
+JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
JNIEnv *env, jclass, jlong pointer, jlong chartPtr, jlongArray arr) {
jint length = env->GetArrayLength(arr);
// GCC only.
@@ -472,13 +457,12 @@ JNIEXPORT jobject JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
// Compute the probability
lm::ngram::ChartState outState;
const VirtualBase *base = reinterpret_cast<const VirtualBase*>(pointer);
- float prob = base->ProbRule(values, values + length, outState);
-
Chart* chart = reinterpret_cast<Chart*>(chartPtr);
- lm::ngram::ChartState* outStatePtr = chart->put(outState);
+ FloatConvert prob;
+ prob.f = base->ProbRule(values, values + length, outState, *chart);
- // Call back constructor to allocate a new instance, with an int argument
- return env->NewObject(base->ChartPair(), base->ChartPairInit(), (long)outStatePtr, prob);
+ StateIndex index = chart->Intern(outState);
+ return static_cast<uint64_t>(index) << 32 | static_cast<uint64_t>(prob.i);
}
JNIEXPORT jfloat JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_estimateRule(
[2/7] incubator-joshua git commit: Merge branch 'master' of
https://github.com/KellenSunderland/incubator-joshua
Posted by mj...@apache.org.
Merge branch 'master' of https://github.com/KellenSunderland/incubator-joshua
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/929760a3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/929760a3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/929760a3
Branch: refs/heads/master
Commit: 929760a35dda5f88792c44d6eef41f3e58cf7250
Parents: 3b5811a 5e95475
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 11:23:42 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 11:23:42 2016 +0200
----------------------------------------------------------------------
jni/kenlm_wrap.cc | 142 ++++++++++++++++++++++---------------------------
1 file changed, 63 insertions(+), 79 deletions(-)
----------------------------------------------------------------------
[6/7] incubator-joshua git commit: Make ChartState start at index 1.
Posted by mj...@apache.org.
Make ChartState start at index 1.
Fixes bug with state 0 which was getting confused for the vocab id 0 aka <unk>.
The sign bit distinguishes a word from a ChartState id.
Written by @kpu on Kellen's laptop.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/bdd670bd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/bdd670bd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/bdd670bd
Branch: refs/heads/master
Commit: bdd670bd0554a73c0de0db5383e07ce5e8df586f
Parents: 0252942
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 17:28:51 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 17:46:50 2016 +0200
----------------------------------------------------------------------
jni/kenlm_wrap.cc | 5 +++--
src/test/java/org/apache/joshua/system/KenLmTest.java | 3 +--
2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bdd670bd/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index bbe6e7c..bd82fe4 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -92,11 +92,12 @@ class Chart {
if (!ins.second) {
vec_.pop_back();
}
- return *ins.first;
+ return *ins.first + 1; // +1 so that the first id is 1, not 0. We use sign bit to
+ // distinguish ChartState from vocab id.
}
const lm::ngram::ChartState &InterpretState(StateIndex index) const {
- return vec_[index];
+ return vec_[index - 1];
}
private:
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bdd670bd/src/test/java/org/apache/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/system/KenLmTest.java b/src/test/java/org/apache/joshua/system/KenLmTest.java
index 003b5d9..9f26f8f 100644
--- a/src/test/java/org/apache/joshua/system/KenLmTest.java
+++ b/src/test/java/org/apache/joshua/system/KenLmTest.java
@@ -80,7 +80,6 @@ public class KenLmTest {
// THEN
assertEquals("ngram probabilities differ for word and id based n-gram query", prob_string, prob_id,
Float.MIN_VALUE);
-
}
@Test
@@ -106,7 +105,7 @@ public class KenLmTest {
// THEN
assertThat(result, is(notNullValue()));
- assertThat(result.state.getState(), is(0L));
+ assertThat(result.state.getState(), is(1L));
assertThat(result.prob, is(-3.7906885f));
}
[4/7] incubator-joshua git commit: Manage pool of states on a per LM,
per sentence basis
Posted by mj...@apache.org.
Manage pool of states on a per LM, per sentence basis
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/0252942d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/0252942d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/0252942d
Branch: refs/heads/master
Commit: 0252942dafc1679f2c5d6b8d6da7cd6884ca40c3
Parents: 4e07bb6
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 13:58:05 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 15:59:46 2016 +0200
----------------------------------------------------------------------
.../org/apache/joshua/decoder/KenLMPool.java | 42 ++++++++++++++++++++
.../decoder/LanguageModelStateManager.java | 29 ++++++++++++++
.../org/apache/joshua/decoder/Translation.java | 17 +-------
.../org/apache/joshua/decoder/ff/lm/KenLM.java | 25 +++++-------
.../ff/lm/StateMinimizingLanguageModel.java | 30 ++++----------
.../joshua/decoder/segment_file/Sentence.java | 11 +++++
.../org/apache/joshua/system/KenLmTest.java | 16 ++++----
7 files changed, 109 insertions(+), 61 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/KenLMPool.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/KenLMPool.java b/src/main/java/org/apache/joshua/decoder/KenLMPool.java
new file mode 100644
index 0000000..378ac51
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/KenLMPool.java
@@ -0,0 +1,42 @@
+package org.apache.joshua.decoder;
+
+import org.apache.joshua.decoder.ff.lm.KenLM;
+
+/**
+ * Class to wrap a KenLM pool of states. This class is not ThreadSafe. It should be
+ * used in a scoped context, and close must be called to release native resources. It
+ * does implement a custom finalizer that will release these resources if needed, but
+ * this should not be relied on.
+ *
+ * @author Kellen Sunderland
+ */
+
+public class KenLMPool implements AutoCloseable {
+
+ private final long pool;
+ private final KenLM languageModel;
+ private boolean released = false;
+
+ public KenLMPool(long pool, KenLM languageModel) {
+ this.pool = pool;
+ this.languageModel = languageModel;
+ }
+
+ public long getPool() {
+ return pool;
+ }
+
+ @Override
+ protected void finalize() throws Throwable {
+ close();
+ super.finalize();
+ }
+
+ @Override
+ public void close() {
+ if (!released) {
+ released = true;
+ languageModel.destroyLMPool(pool);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java b/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java
new file mode 100644
index 0000000..6a3c4b3
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java
@@ -0,0 +1,29 @@
+package org.apache.joshua.decoder;
+
+import org.apache.joshua.decoder.ff.lm.KenLM;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * @author Kellen Sunderland
+ */
+public class LanguageModelStateManager {
+
+ private Map<UUID, KenLMPool> languageModelPoolMapping = new HashMap<>();
+
+ public KenLMPool getStatePool(UUID languageModelId, KenLM languageModel) {
+ KenLMPool statePool = languageModelPoolMapping.get(languageModelId);
+ if (statePool == null) {
+ statePool = languageModel.createLMPool();
+ languageModelPoolMapping.put(languageModelId, statePool);
+ }
+ return statePool;
+ }
+
+ public void clearStatePool() {
+ languageModelPoolMapping.values().forEach(KenLMPool::close);
+ languageModelPoolMapping.clear();
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/Translation.java b/src/main/java/org/apache/joshua/decoder/Translation.java
index ade9b22..ff2aed0 100644
--- a/src/main/java/org/apache/joshua/decoder/Translation.java
+++ b/src/main/java/org/apache/joshua/decoder/Translation.java
@@ -182,8 +182,8 @@ public class Translation {
}
- // remove state from StateMinimizingLanguageModel instances in features.
- destroyKenLMStates(featureFunctions);
+ // Force any StateMinimizingLanguageModel pool mappings to be cleaned
+ source.getStateManager().clearStatePool();
}
@@ -224,17 +224,4 @@ public class Translation {
}
return structuredTranslations;
}
-
- /**
- * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
- * objects for this sentence.
- */
- private void destroyKenLMStates(final List<FeatureFunction> featureFunctions) {
- for (FeatureFunction feature : featureFunctions) {
- if (feature instanceof StateMinimizingLanguageModel) {
- ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
- break;
- }
- }
- }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
index b0a1117..0646f68 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
@@ -19,6 +19,7 @@
package org.apache.joshua.decoder.ff.lm;
import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.KenLMPool;
import org.apache.joshua.decoder.ff.state_maintenance.KenLMState;
import org.apache.joshua.util.FormatUtils;
import org.slf4j.Logger;
@@ -105,8 +106,8 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
}
}
- public long createLMPool() {
- return createPool();
+ public KenLMPool createLMPool() {
+ return new KenLMPool(createPool(), this);
}
public void destroyLMPool(long pointer) {
@@ -153,24 +154,16 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
* state and the LM probability incurred along this rule.
*
* @param words array of words
- * @param poolPointer todo
+ * @param poolWrapper an object that wraps a pool reference returned from KenLM createPool
* @return the updated {@link org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair} e.g.
* KenLM state and the LM probability incurred along this rule
*/
- public StateProbPair probRule(long[] words, long poolPointer) {
+ public StateProbPair probRule(long[] words, KenLMPool poolWrapper) {
+ long packedResult = probRule(pointer, poolWrapper.getPool(), words);
+ int state = (int) (packedResult >> 32);
+ float probVal = Float.intBitsToFloat((int)packedResult);
- StateProbPair pair = null;
- try {
- long packedResult = probRule(pointer, poolPointer, words);
- int state = (int) (packedResult >> 32);
- float probVal = Float.intBitsToFloat((int)packedResult);
- pair = new StateProbPair(state, probVal);
- } catch (NoSuchMethodError e) {
- e.printStackTrace();
- System.exit(1);
- }
-
- return pair;
+ return new StateProbPair(state, probVal);
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java b/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
index 4bec379..2219ce8 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
@@ -21,10 +21,11 @@ package org.apache.joshua.decoder.ff.lm;
import static org.apache.joshua.util.FormatUtils.isNonterminal;
import java.util.List;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.UUID;
import org.apache.joshua.corpus.Vocabulary;
import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.KenLMPool;
import org.apache.joshua.decoder.chart_parser.SourcePath;
import org.apache.joshua.decoder.ff.FeatureVector;
import org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair;
@@ -42,9 +43,6 @@ import org.apache.joshua.decoder.segment_file.Sentence;
*/
public class StateMinimizingLanguageModel extends LanguageModelFF {
- // maps from sentence numbers to KenLM-side pools used to allocate state
- private static final ConcurrentHashMap<Integer, Long> poolMap = new ConcurrentHashMap<>();
-
public StateMinimizingLanguageModel(FeatureVector weights, String[] args, JoshuaConfiguration config) {
super(weights, args, config);
this.type = "kenlm";
@@ -87,6 +85,8 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
return lmCost + oovCost;
}
+ private UUID languageModelPoolId = UUID.randomUUID();
+
/**
* Computes the features incurred along this edge. Note that these features are unweighted costs
* of the feature; they are the feature cost, not the model cost, or the inner product of them.
@@ -115,14 +115,11 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
// map to ken lm ids
final long[] words = mapToKenLmIds(ruleWords, tailNodes, false);
- final int sentID = sentence.id();
- // Since sentId is unique across threads, next operations are safe, but not atomic!
- if (!poolMap.containsKey(sentID)) {
- poolMap.put(sentID, ((KenLM) languageModel).createLMPool());
- }
+ KenLMPool statePool = sentence.getStateManager().getStatePool(languageModelPoolId, (KenLM)
+ languageModel);
// Get the probability of applying the rule and the new state
- final StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
+ final StateProbPair pair = ((KenLM) languageModel).probRule(words, statePool);
// Record the prob
acc.add(denseFeatureIndex, pair.prob);
@@ -162,19 +159,6 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
}
/**
- * Destroys the pool created to allocate state for this sentence. Called from the
- * {@link org.apache.joshua.decoder.Translation} class after outputting the sentence or k-best list. Hosting
- * this map here in KenLMFF statically allows pools to be shared across KenLM instances.
- *
- * @param sentId a key in the poolmap table to destroy
- */
- public void destroyPool(int sentId) {
- if (poolMap.containsKey(sentId))
- ((KenLM) languageModel).destroyLMPool(poolMap.get(sentId));
- poolMap.remove(sentId);
- }
-
- /**
* This function differs from regular transitions because we incorporate the cost of incomplete
* left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
* requested when the object was created).
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java b/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
index 7127870..f84c41a 100644
--- a/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
+++ b/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
@@ -21,16 +21,21 @@ package org.apache.joshua.decoder.segment_file;
import static org.apache.joshua.util.FormatUtils.addSentenceMarkers;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
+import java.util.Map;
import java.util.StringTokenizer;
+import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.joshua.corpus.Vocabulary;
import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.KenLMPool;
+import org.apache.joshua.decoder.LanguageModelStateManager;
import org.apache.joshua.decoder.ff.tm.Grammar;
import org.apache.joshua.lattice.Arc;
import org.apache.joshua.lattice.Lattice;
@@ -77,6 +82,8 @@ public class Sentence {
public JoshuaConfiguration config = null;
+ private LanguageModelStateManager stateManager = new LanguageModelStateManager();
+
/**
* Constructor. Receives a string representing the input sentence. This string may be a
* string-encoded lattice or a plain text string for decoding.
@@ -447,4 +454,8 @@ public class Sentence {
public Node<Token> getNode(int i) {
return getLattice().getNode(i);
}
+
+ public LanguageModelStateManager getStateManager() {
+ return stateManager;
+ }
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/test/java/org/apache/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/system/KenLmTest.java b/src/test/java/org/apache/joshua/system/KenLmTest.java
index aa396d2..003b5d9 100644
--- a/src/test/java/org/apache/joshua/system/KenLmTest.java
+++ b/src/test/java/org/apache/joshua/system/KenLmTest.java
@@ -19,6 +19,7 @@
package org.apache.joshua.system;
import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.KenLMPool;
import org.apache.joshua.decoder.ff.lm.KenLM;
import org.apache.joshua.util.io.KenLmTestUtil;
import org.testng.annotations.AfterMethod;
@@ -29,8 +30,7 @@ import static org.apache.joshua.corpus.Vocabulary.registerLanguageModel;
import static org.apache.joshua.corpus.Vocabulary.unregisterLanguageModels;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.core.Is.is;
-import static org.mockito.Matchers.isNotNull;
-import static org.mockito.Matchers.notNull;
+import static org.hamcrest.core.IsNull.notNullValue;
import static org.testng.Assert.assertTrue;
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
@@ -84,7 +84,7 @@ public class KenLmTest {
}
@Test
- public void givenKenLm_whenQueryingForNgramProbability2_thenIdAndStringMethodsReturnTheSame() {
+ public void givenKenLm_whenQueryingWithState_thenStateAndProbReturned() {
// GIVEN
KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
@@ -94,16 +94,18 @@ public class KenLmTest {
int[] ids = Vocabulary.addAll(sentence);
long[] longIds = new long[ids.length];
- for(int i = 0; i< words.length; i++) {
+ for (int i = 0; i < words.length; i++) {
longIds[i] = ids[i];
}
// WHEN
- long poolPointer = kenLm.createLMPool();
- KenLM.StateProbPair result = kenLm.probRule(longIds, poolPointer);
- kenLm.destroyLMPool(poolPointer);
+ KenLM.StateProbPair result;
+ try (KenLMPool poolPointer = kenLm.createLMPool()) {
+ result = kenLm.probRule(longIds, poolPointer);
+ }
// THEN
+ assertThat(result, is(notNullValue()));
assertThat(result.state.getState(), is(0L));
assertThat(result.prob, is(-3.7906885f));
}
[7/7] incubator-joshua git commit: Merge branch
'kellen-kenlm_pool_fix'
Posted by mj...@apache.org.
Merge branch 'kellen-kenlm_pool_fix'
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/90fff5ab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/90fff5ab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/90fff5ab
Branch: refs/heads/master
Commit: 90fff5ab1de3da23c0f64f90e69ce0da2392fd49
Parents: 99e8f95 bdd670b
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Sep 13 17:48:21 2016 +0200
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Sep 13 17:48:21 2016 +0200
----------------------------------------------------------------------
jni/kenlm_wrap.cc | 144 +++++++++----------
.../org/apache/joshua/decoder/KenLMPool.java | 42 ++++++
.../decoder/LanguageModelStateManager.java | 29 ++++
.../org/apache/joshua/decoder/Translation.java | 17 +--
.../org/apache/joshua/decoder/ff/lm/KenLM.java | 24 ++--
.../ff/lm/StateMinimizingLanguageModel.java | 30 +---
.../joshua/decoder/segment_file/Sentence.java | 11 ++
.../org/apache/joshua/system/KenLmTest.java | 30 ++++
8 files changed, 196 insertions(+), 131 deletions(-)
----------------------------------------------------------------------
[5/7] incubator-joshua git commit: Moved resources, disabled test
Posted by mj...@apache.org.
Moved resources, disabled test
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/99e8f951
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/99e8f951
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/99e8f951
Branch: refs/heads/master
Commit: 99e8f951009213b55595c83cab2f0db3701c04ca
Parents: 49bbcac
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Sep 13 16:32:15 2016 +0200
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Sep 13 16:32:15 2016 +0200
----------------------------------------------------------------------
.../decoder/phrase/constrained/corpus.es | 1 -
.../decoder/phrase/constrained/glue.grammar | 3 --
.../decoder/phrase/constrained/joshua.config | 29 -----------------
.../decoder/phrase/constrained/output.gold | 5 ---
.../decoder/phrase/constrained/test.sh | 32 -------------------
src/test/resources/decoder/phrase/decode/config | 29 -----------------
.../decoder/phrase/decode/config.packed | 29 -----------------
.../resources/decoder/phrase/decode/corpus.es | 1 -
.../resources/decoder/phrase/decode/lm.1.gz | Bin 2235 -> 0 bytes
.../resources/decoder/phrase/decode/output.gold | 1 -
.../resources/decoder/phrase/decode/rules.1.gz | Bin 2998042 -> 0 bytes
.../decoder/phrase/decode/rules.packed/config | 2 --
.../decoder/phrase/decode/rules.packed/encoding | Bin 87 -> 0 bytes
.../decode/rules.packed/slice_00000.features | Bin 4128858 -> 0 bytes
.../decode/rules.packed/slice_00000.source | Bin 1982228 -> 0 bytes
.../decode/rules.packed/slice_00000.target | Bin 1463856 -> 0 bytes
.../rules.packed/slice_00000.target.lookup | Bin 28 -> 0 bytes
.../phrase/decode/rules.packed/vocabulary | Bin 169225 -> 0 bytes
.../decoder/phrase/decode/test-packed.sh | 32 -------------------
.../resources/decoder/phrase/decode/test.sh | 17 ----------
src/test/resources/phrase_decoder/config.packed | 29 +++++++++++++++++
.../phrase_decoder/rules.packed/config | 2 ++
.../phrase_decoder/rules.packed/encoding | Bin 0 -> 87 bytes
.../rules.packed/slice_00000.features | Bin 0 -> 4128858 bytes
.../rules.packed/slice_00000.source | Bin 0 -> 1982228 bytes
.../rules.packed/slice_00000.target | Bin 0 -> 1463856 bytes
.../rules.packed/slice_00000.target.lookup | Bin 0 -> 28 bytes
.../phrase_decoder/rules.packed/vocabulary | Bin 0 -> 169225 bytes
28 files changed, 31 insertions(+), 181 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/corpus.es
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/corpus.es b/src/test/resources/decoder/phrase/constrained/corpus.es
deleted file mode 100644
index a063f9a..0000000
--- a/src/test/resources/decoder/phrase/constrained/corpus.es
+++ /dev/null
@@ -1 +0,0 @@
-una estrategia republicana para obstaculizar la reelección de Obama ||| President Obama to hinder a strategy for Republican re @-@ election
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/glue.grammar
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/glue.grammar b/src/test/resources/decoder/phrase/constrained/glue.grammar
deleted file mode 100644
index 6a1162f..0000000
--- a/src/test/resources/decoder/phrase/constrained/glue.grammar
+++ /dev/null
@@ -1,3 +0,0 @@
-[GOAL] ||| <s> ||| <s> ||| 0
-[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
-[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/joshua.config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/joshua.config b/src/test/resources/decoder/phrase/constrained/joshua.config
deleted file mode 100644
index be45e0a..0000000
--- a/src/test/resources/decoder/phrase/constrained/joshua.config
+++ /dev/null
@@ -1,29 +0,0 @@
-tm = moses pt 0 ../decode/rules.1.gz
-
-lm = kenlm 5 true false 100 ../decode/lm.1.gz
-
-mark-oovs = false
-pop-limit = 10
-top-n = 5
-
-output-format = %i ||| %s ||| %f ||| %c
-
-include-align-index = false
-reordering-limit = 10
-use-unique-nbest = false
-
-# And these are the feature functions to activate.
-feature-function = OOVPenalty
-feature-function = WordPenalty
-feature-function = Distortion
-feature-function = PhrasePenalty -owner pt
-
-OOVPenalty 1.0
-Distortion 0.114849
-WordPenalty -0.201544
-PhrasePenalty -0.236965
-tm_pt_0 0.0370068
-tm_pt_1 0.0495759
-tm_pt_2 0.196742
-tm_pt_3 0.0745423
-lm_0 0.204412452147565
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/output.gold
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/output.gold b/src/test/resources/decoder/phrase/constrained/output.gold
deleted file mode 100644
index a784043..0000000
--- a/src/test/resources/decoder/phrase/constrained/output.gold
+++ /dev/null
@@ -1,5 +0,0 @@
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-15.792 tm_pt_1=-17.550 tm_pt_2=-14.599 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=7.000 ||| -15.163
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-16.919 tm_pt_1=-17.550 tm_pt_2=-14.917 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=8.000 ||| -15.505
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-14.986 tm_pt_1=-17.951 tm_pt_2=-14.075 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=6.000 ||| -15.762
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-16.112 tm_pt_1=-17.951 tm_pt_2=-14.393 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.103
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-16.329 tm_pt_1=-17.951 tm_pt_2=-15.136 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.257
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/test.sh
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/test.sh b/src/test/resources/decoder/phrase/constrained/test.sh
deleted file mode 100755
index 6bef145..0000000
--- a/src/test/resources/decoder/phrase/constrained/test.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c joshua.config > output 2> log
-
-# Compare
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
- rm -f diff output log
- exit 0
-else
- exit 1
-fi
-
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/config b/src/test/resources/decoder/phrase/decode/config
deleted file mode 100644
index 7e6f2b8..0000000
--- a/src/test/resources/decoder/phrase/decode/config
+++ /dev/null
@@ -1,29 +0,0 @@
-tm = moses -owner pt -maxspan 0 -path rules.1.gz -max-source-len 5
-feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
-
-search = stack
-
-mark-oovs = false
-pop-limit = 10
-top-n = 1
-
-output-format = %i ||| %s ||| %f ||| %c
-
-include-align-index = false
-reordering-limit = 6
-
-# And these are the feature functions to activate.
-feature-function = OOVPenalty
-feature-function = WordPenalty
-feature-function = Distortion
-feature-function = PhrasePenalty -owner pt
-
-OOVPenalty 1.0
-Distortion 0.114849
-WordPenalty -0.201544
-PhrasePenalty -0.236965
-tm_pt_0 0.0370068
-tm_pt_1 0.0495759
-tm_pt_2 0.196742
-tm_pt_3 0.0745423
-lm_0 0.204412452147565
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/config.packed
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/config.packed b/src/test/resources/decoder/phrase/decode/config.packed
deleted file mode 100644
index 9987b1a..0000000
--- a/src/test/resources/decoder/phrase/decode/config.packed
+++ /dev/null
@@ -1,29 +0,0 @@
-tm = moses -owner pt -maxspan 0 -path rules.packed -max-source-len 5
-feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
-
-search = stack
-
-mark-oovs = false
-pop-limit = 10
-top-n = 1
-
-output-format = %i ||| %s ||| %f ||| %c
-
-include-align-index = false
-reordering-limit = 6
-
-# And these are the feature functions to activate.
-feature-function = OOVPenalty
-feature-function = WordPenalty
-feature-function = Distortion
-feature-function = PhrasePenalty -owner pt
-
-OOVPenalty 1.0
-Distortion 0.114849
-WordPenalty -0.201544
-PhrasePenalty -0.236965
-tm_pt_0 0.0370068
-tm_pt_1 0.0495759
-tm_pt_2 0.196742
-tm_pt_3 0.0745423
-lm_0 0.204412452147565
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/corpus.es
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/corpus.es b/src/test/resources/decoder/phrase/decode/corpus.es
deleted file mode 100644
index 6e255f9..0000000
--- a/src/test/resources/decoder/phrase/decode/corpus.es
+++ /dev/null
@@ -1 +0,0 @@
-una estrategia republicana para obstaculizar la reelección de Obama
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/lm.1.gz
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/lm.1.gz b/src/test/resources/decoder/phrase/decode/lm.1.gz
deleted file mode 100644
index 3f4c453..0000000
Binary files a/src/test/resources/decoder/phrase/decode/lm.1.gz and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/output.gold
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/output.gold b/src/test/resources/decoder/phrase/decode/output.gold
deleted file mode 100644
index 0083345..0000000
--- a/src/test/resources/decoder/phrase/decode/output.gold
+++ /dev/null
@@ -1 +0,0 @@
-0 ||| a strategy republican to hinder reelection Obama ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.1.gz
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.1.gz b/src/test/resources/decoder/phrase/decode/rules.1.gz
deleted file mode 100644
index 14466e9..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.1.gz and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/config b/src/test/resources/decoder/phrase/decode/rules.packed/config
deleted file mode 100644
index 2251fe6..0000000
--- a/src/test/resources/decoder/phrase/decode/rules.packed/config
+++ /dev/null
@@ -1,2 +0,0 @@
-version = 4
-max-source-len = 3
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/encoding
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/encoding b/src/test/resources/decoder/phrase/decode/rules.packed/encoding
deleted file mode 100644
index 57e7b75..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/encoding and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features
deleted file mode 100644
index 2a77e43..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source
deleted file mode 100644
index c384c54..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target
deleted file mode 100644
index 8375cf0..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup
deleted file mode 100644
index 3e8c294..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary b/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary
deleted file mode 100644
index 528a970..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/test-packed.sh
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/test-packed.sh b/src/test/resources/decoder/phrase/decode/test-packed.sh
deleted file mode 100755
index a65c031..0000000
--- a/src/test/resources/decoder/phrase/decode/test-packed.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c config.packed > output 2> log
-
-# Compare
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
- rm -f diff output log
- exit 0
-else
- exit 1
-fi
-
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/test.sh
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/test.sh b/src/test/resources/decoder/phrase/decode/test.sh
deleted file mode 100755
index 4732f73..0000000
--- a/src/test/resources/decoder/phrase/decode/test.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-set -u
-
-cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c config > output 2> log
-
-# Compare
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
- rm -f diff output log
- exit 0
-else
- exit 1
-fi
-
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/config.packed
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/config.packed b/src/test/resources/phrase_decoder/config.packed
new file mode 100644
index 0000000..9987b1a
--- /dev/null
+++ b/src/test/resources/phrase_decoder/config.packed
@@ -0,0 +1,29 @@
+tm = moses -owner pt -maxspan 0 -path rules.packed -max-source-len 5
+feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
+
+search = stack
+
+mark-oovs = false
+pop-limit = 10
+top-n = 1
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = false
+reordering-limit = 6
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/config
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/config b/src/test/resources/phrase_decoder/rules.packed/config
new file mode 100644
index 0000000..2251fe6
--- /dev/null
+++ b/src/test/resources/phrase_decoder/rules.packed/config
@@ -0,0 +1,2 @@
+version = 4
+max-source-len = 3
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/encoding
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/encoding b/src/test/resources/phrase_decoder/rules.packed/encoding
new file mode 100644
index 0000000..57e7b75
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/encoding differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.features
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.features b/src/test/resources/phrase_decoder/rules.packed/slice_00000.features
new file mode 100644
index 0000000..2a77e43
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.features differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.source
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.source b/src/test/resources/phrase_decoder/rules.packed/slice_00000.source
new file mode 100644
index 0000000..c384c54
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.source differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.target
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.target b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target
new file mode 100644
index 0000000..8375cf0
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup
new file mode 100644
index 0000000..3e8c294
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/vocabulary
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/vocabulary b/src/test/resources/phrase_decoder/rules.packed/vocabulary
new file mode 100644
index 0000000..528a970
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/vocabulary differ
[3/7] incubator-joshua git commit: Adapted Java side of JNI interface
to get state and prob from packed long
Posted by mj...@apache.org.
Adapted Java side of JNI interface to get state and prob from packed long
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/4e07bb66
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/4e07bb66
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/4e07bb66
Branch: refs/heads/master
Commit: 4e07bb66d28e55357ee6b19b3c60a76a31d8dd75
Parents: 929760a
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 12:39:41 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 12:39:41 2016 +0200
----------------------------------------------------------------------
jni/kenlm_wrap.cc | 9 +++---
.../org/apache/joshua/decoder/ff/lm/KenLM.java | 7 +++--
.../org/apache/joshua/system/KenLmTest.java | 29 ++++++++++++++++++++
3 files changed, 39 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4e07bb66/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index 8f69e19..bbe6e7c 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -95,7 +95,7 @@ class Chart {
return *ins.first;
}
- const ChartState &InterpretState(StateIndex index) const {
+ const lm::ngram::ChartState &InterpretState(StateIndex index) const {
return vec_[index];
}
@@ -201,7 +201,7 @@ public:
lm::ngram::RuleScore<Model> ruleScore(m_, state);
if (*begin < 0) {
- ruleScore.BeginNonTerminal(chart.Interpet(-*begin));
+ ruleScore.BeginNonTerminal(chart.InterpretState(-*begin));
} else {
const lm::WordIndex word = map_[*begin];
if (word == m_.GetVocabulary().BeginSentence()) {
@@ -213,7 +213,7 @@ public:
for (jlong* i = begin + 1; i != end; i++) {
long word = *i;
if (word < 0)
- ruleScore.NonTerminal(chart.Interpret(-word));
+ ruleScore.NonTerminal(chart.InterpretState(-word));
else
ruleScore.Terminal(map_[word]);
}
@@ -449,6 +449,7 @@ union FloatConverter {
JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
JNIEnv *env, jclass, jlong pointer, jlong chartPtr, jlongArray arr) {
+
jint length = env->GetArrayLength(arr);
// GCC only.
jlong values[length];
@@ -458,7 +459,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
lm::ngram::ChartState outState;
const VirtualBase *base = reinterpret_cast<const VirtualBase*>(pointer);
Chart* chart = reinterpret_cast<Chart*>(chartPtr);
- FloatConvert prob;
+ FloatConverter prob;
prob.f = base->ProbRule(values, values + length, outState, *chart);
StateIndex index = chart->Intern(outState);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4e07bb66/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
index 044c85f..b0a1117 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
@@ -61,7 +61,7 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
private static native boolean isLmOov(long ptr, int word);
- private static native StateProbPair probRule(long ptr, long pool, long words[]);
+ private static native long probRule(long ptr, long pool, long words[]);
private static native float estimateRule(long ptr, long words[]);
@@ -161,7 +161,10 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
StateProbPair pair = null;
try {
- pair = probRule(pointer, poolPointer, words);
+ long packedResult = probRule(pointer, poolPointer, words);
+ int state = (int) (packedResult >> 32);
+ float probVal = Float.intBitsToFloat((int)packedResult);
+ pair = new StateProbPair(state, probVal);
} catch (NoSuchMethodError e) {
e.printStackTrace();
System.exit(1);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4e07bb66/src/test/java/org/apache/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/system/KenLmTest.java b/src/test/java/org/apache/joshua/system/KenLmTest.java
index 74baef3..aa396d2 100644
--- a/src/test/java/org/apache/joshua/system/KenLmTest.java
+++ b/src/test/java/org/apache/joshua/system/KenLmTest.java
@@ -27,6 +27,10 @@ import org.testng.annotations.Test;
import static org.apache.joshua.corpus.Vocabulary.registerLanguageModel;
import static org.apache.joshua.corpus.Vocabulary.unregisterLanguageModels;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.core.Is.is;
+import static org.mockito.Matchers.isNotNull;
+import static org.mockito.Matchers.notNull;
import static org.testng.Assert.assertTrue;
import static org.testng.AssertJUnit.assertEquals;
import static org.testng.AssertJUnit.assertFalse;
@@ -80,6 +84,31 @@ public class KenLmTest {
}
@Test
+ public void givenKenLm_whenQueryingForNgramProbability2_thenIdAndStringMethodsReturnTheSame() {
+ // GIVEN
+ KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
+
+ registerLanguageModel(kenLm);
+ String sentence = "Wayne Gretzky";
+ String[] words = sentence.split("\\s+");
+ int[] ids = Vocabulary.addAll(sentence);
+ long[] longIds = new long[ids.length];
+
+ for(int i = 0; i< words.length; i++) {
+ longIds[i] = ids[i];
+ }
+
+ // WHEN
+ long poolPointer = kenLm.createLMPool();
+ KenLM.StateProbPair result = kenLm.probRule(longIds, poolPointer);
+ kenLm.destroyLMPool(poolPointer);
+
+ // THEN
+ assertThat(result.state.getState(), is(0L));
+ assertThat(result.prob, is(-3.7906885f));
+ }
+
+ @Test
public void givenKenLm_whenIsKnownWord_thenReturnValuesAreCorrect() {
KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
assertTrue(kenLm.isKnownWord("Wayne"));