You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/13 15:48:50 UTC

[1/7] incubator-joshua git commit: Probably won't compile but gets the idea across

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 49bbcac01 -> 90fff5ab1


Probably won't compile but gets the idea across


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/5e954752
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/5e954752
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/5e954752

Branch: refs/heads/master
Commit: 5e9547526ad4bc15f48e665608897def552cb9ab
Parents: c3e7a15
Author: Kenneth Heafield <gi...@kheafield.com>
Authored: Tue Sep 13 10:58:26 2016 +0200
Committer: Kenneth Heafield <gi...@kheafield.com>
Committed: Tue Sep 13 10:58:26 2016 +0200

----------------------------------------------------------------------
 jni/kenlm_wrap.cc | 142 ++++++++++++++++++++++---------------------------
 1 file changed, 63 insertions(+), 79 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5e954752/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index 11d9c28..8f69e19 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -20,7 +20,6 @@
 #include "lm/left.hh"
 #include "lm/state.hh"
 #include "util/murmur_hash.hh"
-#include "util/pool.hh"
 
 #include <iostream>
 
@@ -30,7 +29,8 @@
 #include <pthread.h>
 
 // Grr.  Everybody's compiler is slightly different and I'm trying to not depend on boost.
-#include <unordered_map>
+#include <unordered_set>
+#include <vector>
 
 // Verify that jint and lm::ngram::WordIndex are the same size. If this breaks
 // for you, there's a need to revise probString.
@@ -45,7 +45,35 @@ template<> struct StaticCheck<true> {
 
 typedef StaticCheck<sizeof(jint) == sizeof(lm::WordIndex)>::StaticAssertionPassed FloatSize;
 
-typedef std::unordered_multimap<uint64_t, lm::ngram::ChartState*> PoolHash;
+// Could be uint64_t if you wanted to have 33-bit support.
+typedef uint32_t StateIndex;
+typedef std::vector<lm::ngram::ChartState> StateVector;
+
+class HashIndex : public std::unary_function<StateIndex, uint64_t> {
+  public:
+    explicit HashIndex(const StateVector &vec) : vec_(vec) {}
+
+    uint64_t operator()(StateIndex index) const {
+      return hash_value(vec_[index]);
+    }
+
+  private:
+    const StateVector &vec_;
+};
+
+class EqualIndex : public std::binary_function<StateIndex, StateIndex, bool> {
+  public:
+    explicit EqualIndex(const StateVector &vec) : vec_(vec) {}
+
+    bool operator()(StateIndex first, StateIndex second) const {
+      return vec_[first] == vec_[second];
+    }
+
+  private:
+    const StateVector &vec_;
+};
+
+typedef std::unordered_set<StateIndex, HashIndex, EqualIndex> Lookup;
 
 /**
  * A Chart bundles together a unordered_multimap that maps ChartState signatures to a single
@@ -54,46 +82,26 @@ typedef std::unordered_multimap<uint64_t, lm::ngram::ChartState*> PoolHash;
  * across KenLMs for the same sentence.  Multimap is used to avoid hash collisions which can
  * return incorrect results, and cause out-of-bounds lookups when multiple KenLMs are in use.
  */
-struct Chart {
-  // A cache for allocated chart objects
-  PoolHash* poolHash;
-  // Pool used to allocate new ones
-  util::Pool* pool;
-
-  Chart() {
-    poolHash = new PoolHash();
-    pool = new util::Pool();
-  }
-
-  ~Chart() {
-    delete poolHash;
-    pool->FreeAll();
-    delete pool;
-  }
-
-  lm::ngram::ChartState* put(const lm::ngram::ChartState& state) {
-    lm::ngram::ChartState* state_ptr = nullptr;
-    uint64_t hashValue = lm::ngram::hash_value(state);
-    auto state_it = poolHash->find(hashValue);
-
-    // Try to retrieve a matching ChartState pointer from our Pool
-    while(state_it != poolHash->end()) {
-      if (state == *(state_it->second)) {
-        state_ptr = state_it->second;
-        break;
+class Chart {
+  public:
+    Chart() : lookup_(1000, HashIndex(vec_), EqualIndex(vec_)) {}
+
+    StateIndex Intern(const lm::ngram::ChartState &state) {
+      vec_.push_back(state);
+      std::pair<Lookup::iterator, bool> ins(lookup_.insert(vec_.size() - 1));
+      if (!ins.second) {
+        vec_.pop_back();
       }
-      state_it++;
+      return *ins.first;
     }
 
-    // Unable to find this ChartState in our pool, allocate new space for it
-    if (!state_ptr) {
-      state_ptr = (lm::ngram::ChartState *) pool->Allocate(sizeof(lm::ngram::ChartState));
-      *state_ptr = state;
-      (*poolHash).insert({hashValue, state_ptr});
+    const ChartState &InterpretState(StateIndex index) const {
+      return vec_[index];
     }
 
-    return state_ptr;
-  }
+  private:
+    StateVector vec_;
+    Lookup lookup_;
 };
 
 // Vocab ids above what the vocabulary knows about are unknown and should
@@ -131,7 +139,7 @@ public:
 
   virtual bool IsKnownWordIndex(const lm::WordIndex& id) const = 0;
 
-  virtual float ProbRule(jlong *begin, jlong *end, lm::ngram::ChartState& state) const = 0;
+  virtual float ProbRule(jlong *begin, jlong *end, lm::ngram::ChartState& state, const Chart &chart) const = 0;
 
   virtual float ProbString(jint * const begin, jint * const end,
       jint start) const = 0;
@@ -142,22 +150,9 @@ public:
 
   virtual bool RegisterWord(const StringPiece& word, const int joshua_id) = 0;
 
-  void RememberReturnMethod(jclass chart_pair, jmethodID chart_pair_init) {
-    chart_pair_ = chart_pair;
-    chart_pair_init_ = chart_pair_init;
-  }
-
-  jclass ChartPair() const { return chart_pair_; }
-  jmethodID ChartPairInit() const { return chart_pair_init_; }
-
 protected:
   VirtualBase() {
   }
-
-private:
-  // Hack: these are remembered so we can avoid looking them up every time.
-  jclass chart_pair_;
-  jmethodID chart_pair_init_;
 };
 
 template<class Model> class VirtualImpl: public VirtualBase {
@@ -201,12 +196,12 @@ public:
       return id != m_.GetVocabulary().NotFound();
   }
 
-  float ProbRule(jlong * const begin, jlong * const end, lm::ngram::ChartState& state) const {
+  float ProbRule(jlong * const begin, jlong * const end, lm::ngram::ChartState& state, const Chart &chart) const {
     if (begin == end) return 0.0;
     lm::ngram::RuleScore<Model> ruleScore(m_, state);
 
     if (*begin < 0) {
-      ruleScore.BeginNonTerminal(*reinterpret_cast<const lm::ngram::ChartState*>(-*begin));
+      ruleScore.BeginNonTerminal(chart.Interpet(-*begin));
     } else {
       const lm::WordIndex word = map_[*begin];
       if (word == m_.GetVocabulary().BeginSentence()) {
@@ -218,7 +213,7 @@ public:
     for (jlong* i = begin + 1; i != end; i++) {
       long word = *i;
       if (word < 0)
-        ruleScore.NonTerminal(*reinterpret_cast<const lm::ngram::ChartState*>(-word));
+        ruleScore.NonTerminal(chart.Interpret(-word));
       else
         ruleScore.Terminal(map_[word]);
     }
@@ -341,18 +336,6 @@ JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_construct(
   VirtualBase *ret;
   try {
     ret = ConstructModel(str);
-
-    // Get a class reference for the type pair that char
-    jclass local_chart_pair = env->FindClass("org/apache/joshua/decoder/ff/lm/KenLM$StateProbPair");
-    UTIL_THROW_IF(!local_chart_pair, util::Exception, "Failed to find org/apache/joshua/decoder/ff/lm/KenLM$StateProbPair");
-    jclass chart_pair = (jclass)env->NewGlobalRef(local_chart_pair);
-    env->DeleteLocalRef(local_chart_pair);
-
-    // Get the Method ID of the constructor which takes an int
-    jmethodID chart_pair_init = env->GetMethodID(chart_pair, "<init>", "(JF)V");
-    UTIL_THROW_IF(!chart_pair_init, util::Exception, "Failed to find init method");
-
-    ret->RememberReturnMethod(chart_pair, chart_pair_init);
   } catch (std::exception &e) {
     std::cerr << e.what() << std::endl;
     abort();
@@ -363,20 +346,17 @@ JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_construct(
 
 JNIEXPORT void JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_destroy(
     JNIEnv *env, jclass, jlong pointer) {
-  VirtualBase *base = reinterpret_cast<VirtualBase*>(pointer);
-  env->DeleteGlobalRef(base->ChartPair());
-  delete base;
+  delete reinterpret_cast<VirtualBase*>(pointer);
 }
 
-JNIEXPORT long JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_createPool(
+JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_createPool(
     JNIEnv *env, jclass) {
   return reinterpret_cast<long>(new Chart());
 }
 
 JNIEXPORT void JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_destroyPool(
     JNIEnv *env, jclass, jlong pointer) {
-  Chart* chart = reinterpret_cast<Chart*>(pointer);
-  delete chart;
+  delete reinterpret_cast<Chart*>(pointer);
 }
 
 JNIEXPORT jint JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_order(
@@ -462,7 +442,12 @@ JNIEXPORT jfloat JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probString(
       values + length, start);
 }
 
-JNIEXPORT jobject JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
+union FloatConverter {
+  float f;
+  uint32_t i;
+};
+
+JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
   JNIEnv *env, jclass, jlong pointer, jlong chartPtr, jlongArray arr) {
   jint length = env->GetArrayLength(arr);
   // GCC only.
@@ -472,13 +457,12 @@ JNIEXPORT jobject JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
   // Compute the probability
   lm::ngram::ChartState outState;
   const VirtualBase *base = reinterpret_cast<const VirtualBase*>(pointer);
-  float prob = base->ProbRule(values, values + length, outState);
-
   Chart* chart = reinterpret_cast<Chart*>(chartPtr);
-  lm::ngram::ChartState* outStatePtr = chart->put(outState);
+  FloatConvert prob;
+  prob.f = base->ProbRule(values, values + length, outState, *chart);
 
-  // Call back constructor to allocate a new instance, with an int argument
-  return env->NewObject(base->ChartPair(), base->ChartPairInit(), (long)outStatePtr, prob);
+  StateIndex index = chart->Intern(outState);
+  return static_cast<uint64_t>(index) << 32 | static_cast<uint64_t>(prob.i);
 }
 
 JNIEXPORT jfloat JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_estimateRule(


[2/7] incubator-joshua git commit: Merge branch 'master' of https://github.com/KellenSunderland/incubator-joshua

Posted by mj...@apache.org.
Merge branch 'master' of https://github.com/KellenSunderland/incubator-joshua


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/929760a3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/929760a3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/929760a3

Branch: refs/heads/master
Commit: 929760a35dda5f88792c44d6eef41f3e58cf7250
Parents: 3b5811a 5e95475
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 11:23:42 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 11:23:42 2016 +0200

----------------------------------------------------------------------
 jni/kenlm_wrap.cc | 142 ++++++++++++++++++++++---------------------------
 1 file changed, 63 insertions(+), 79 deletions(-)
----------------------------------------------------------------------



[6/7] incubator-joshua git commit: Make ChartState start at index 1.

Posted by mj...@apache.org.
Make ChartState start at index 1.

Fixes bug with state 0 which was getting confused for the vocab id 0 aka <unk>.
The sign bit distinguishes a word from a ChartState id.
Written by @kpu on Kellen's laptop.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/bdd670bd
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/bdd670bd
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/bdd670bd

Branch: refs/heads/master
Commit: bdd670bd0554a73c0de0db5383e07ce5e8df586f
Parents: 0252942
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 17:28:51 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 17:46:50 2016 +0200

----------------------------------------------------------------------
 jni/kenlm_wrap.cc                                     | 5 +++--
 src/test/java/org/apache/joshua/system/KenLmTest.java | 3 +--
 2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bdd670bd/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index bbe6e7c..bd82fe4 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -92,11 +92,12 @@ class Chart {
       if (!ins.second) {
         vec_.pop_back();
       }
-      return *ins.first;
+      return *ins.first + 1; // +1 so that the first id is 1, not 0.  We use sign bit to 
+                             // distinguish ChartState from vocab id.  
     }
 
     const lm::ngram::ChartState &InterpretState(StateIndex index) const {
-      return vec_[index];
+      return vec_[index - 1];
     }
 
   private:

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/bdd670bd/src/test/java/org/apache/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/system/KenLmTest.java b/src/test/java/org/apache/joshua/system/KenLmTest.java
index 003b5d9..9f26f8f 100644
--- a/src/test/java/org/apache/joshua/system/KenLmTest.java
+++ b/src/test/java/org/apache/joshua/system/KenLmTest.java
@@ -80,7 +80,6 @@ public class KenLmTest {
     // THEN
     assertEquals("ngram probabilities differ for word and id based n-gram query", prob_string, prob_id,
             Float.MIN_VALUE);
-
   }
 
   @Test
@@ -106,7 +105,7 @@ public class KenLmTest {
 
     // THEN
     assertThat(result, is(notNullValue()));
-    assertThat(result.state.getState(), is(0L));
+    assertThat(result.state.getState(), is(1L));
     assertThat(result.prob, is(-3.7906885f));
   }
 


[4/7] incubator-joshua git commit: Manage pool of states on a per LM, per sentence basis

Posted by mj...@apache.org.
Manage pool of states on a per LM, per sentence basis


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/0252942d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/0252942d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/0252942d

Branch: refs/heads/master
Commit: 0252942dafc1679f2c5d6b8d6da7cd6884ca40c3
Parents: 4e07bb6
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 13:58:05 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 15:59:46 2016 +0200

----------------------------------------------------------------------
 .../org/apache/joshua/decoder/KenLMPool.java    | 42 ++++++++++++++++++++
 .../decoder/LanguageModelStateManager.java      | 29 ++++++++++++++
 .../org/apache/joshua/decoder/Translation.java  | 17 +-------
 .../org/apache/joshua/decoder/ff/lm/KenLM.java  | 25 +++++-------
 .../ff/lm/StateMinimizingLanguageModel.java     | 30 ++++----------
 .../joshua/decoder/segment_file/Sentence.java   | 11 +++++
 .../org/apache/joshua/system/KenLmTest.java     | 16 ++++----
 7 files changed, 109 insertions(+), 61 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/KenLMPool.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/KenLMPool.java b/src/main/java/org/apache/joshua/decoder/KenLMPool.java
new file mode 100644
index 0000000..378ac51
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/KenLMPool.java
@@ -0,0 +1,42 @@
+package org.apache.joshua.decoder;
+
+import org.apache.joshua.decoder.ff.lm.KenLM;
+
+/**
+ * Class to wrap a KenLM pool of states.  This class is not ThreadSafe.  It should be
+ * used in a scoped context, and close must be called to release native resources.  It
+ * does implement a custom finalizer that will release these resources if needed, but
+ * this should not be relied on.
+ *
+ * @author Kellen Sunderland
+ */
+
+public class KenLMPool implements AutoCloseable {
+
+  private final long pool;
+  private final KenLM languageModel;
+  private boolean released = false;
+
+  public KenLMPool(long pool, KenLM languageModel) {
+    this.pool = pool;
+    this.languageModel = languageModel;
+  }
+
+  public long getPool() {
+    return pool;
+  }
+
+  @Override
+  protected void finalize() throws Throwable {
+    close();
+    super.finalize();
+  }
+
+  @Override
+  public void close() {
+    if (!released) {
+      released = true;
+      languageModel.destroyLMPool(pool);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java b/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java
new file mode 100644
index 0000000..6a3c4b3
--- /dev/null
+++ b/src/main/java/org/apache/joshua/decoder/LanguageModelStateManager.java
@@ -0,0 +1,29 @@
+package org.apache.joshua.decoder;
+
+import org.apache.joshua.decoder.ff.lm.KenLM;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.UUID;
+
+/**
+ * @author Kellen Sunderland
+ */
+public class LanguageModelStateManager {
+
+  private Map<UUID, KenLMPool> languageModelPoolMapping = new HashMap<>();
+
+  public KenLMPool getStatePool(UUID languageModelId, KenLM languageModel) {
+    KenLMPool statePool = languageModelPoolMapping.get(languageModelId);
+    if (statePool == null) {
+      statePool = languageModel.createLMPool();
+      languageModelPoolMapping.put(languageModelId, statePool);
+    }
+    return statePool;
+  }
+
+  public void clearStatePool() {
+    languageModelPoolMapping.values().forEach(KenLMPool::close);
+    languageModelPoolMapping.clear();
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/Translation.java b/src/main/java/org/apache/joshua/decoder/Translation.java
index ade9b22..ff2aed0 100644
--- a/src/main/java/org/apache/joshua/decoder/Translation.java
+++ b/src/main/java/org/apache/joshua/decoder/Translation.java
@@ -182,8 +182,8 @@ public class Translation {
 
     }
 
-    // remove state from StateMinimizingLanguageModel instances in features.
-    destroyKenLMStates(featureFunctions);
+    // Force any StateMinimizingLanguageModel pool mappings to be cleaned
+    source.getStateManager().clearStatePool();
 
   }
 
@@ -224,17 +224,4 @@ public class Translation {
     }
     return structuredTranslations;
   }
-
-  /**
-   * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
-   * objects for this sentence.
-   */
-  private void destroyKenLMStates(final List<FeatureFunction> featureFunctions) {
-    for (FeatureFunction feature : featureFunctions) {
-      if (feature instanceof StateMinimizingLanguageModel) {
-        ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
-        break;
-      }
-    }
-  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
index b0a1117..0646f68 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
@@ -19,6 +19,7 @@
 package org.apache.joshua.decoder.ff.lm;
 
 import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.KenLMPool;
 import org.apache.joshua.decoder.ff.state_maintenance.KenLMState;
 import org.apache.joshua.util.FormatUtils;
 import org.slf4j.Logger;
@@ -105,8 +106,8 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
     }
   }
 
-  public long createLMPool() {
-    return createPool();
+  public KenLMPool createLMPool() {
+    return new KenLMPool(createPool(), this);
   }
 
   public void destroyLMPool(long pointer) {
@@ -153,24 +154,16 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
    * state and the LM probability incurred along this rule.
    *
    * @param words array of words
-   * @param poolPointer todo
+   * @param poolWrapper an object that wraps a pool reference returned from KenLM createPool
    * @return the updated {@link org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair} e.g.
    * KenLM state and the LM probability incurred along this rule
    */
-  public StateProbPair probRule(long[] words, long poolPointer) {
+  public StateProbPair probRule(long[] words, KenLMPool poolWrapper) {
+    long packedResult = probRule(pointer, poolWrapper.getPool(), words);
+    int state = (int) (packedResult >> 32);
+    float probVal = Float.intBitsToFloat((int)packedResult);
 
-    StateProbPair pair = null;
-    try {
-      long packedResult = probRule(pointer, poolPointer, words);
-      int state = (int) (packedResult >> 32);
-      float probVal = Float.intBitsToFloat((int)packedResult);
-      pair = new StateProbPair(state, probVal);
-    } catch (NoSuchMethodError e) {
-      e.printStackTrace();
-      System.exit(1);
-    }
-
-    return pair;
+    return new StateProbPair(state, probVal);
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java b/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
index 4bec379..2219ce8 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/StateMinimizingLanguageModel.java
@@ -21,10 +21,11 @@ package org.apache.joshua.decoder.ff.lm;
 import static org.apache.joshua.util.FormatUtils.isNonterminal;
 
 import java.util.List;
-import java.util.concurrent.ConcurrentHashMap;
+import java.util.UUID;
 
 import org.apache.joshua.corpus.Vocabulary;
 import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.KenLMPool;
 import org.apache.joshua.decoder.chart_parser.SourcePath;
 import org.apache.joshua.decoder.ff.FeatureVector;
 import org.apache.joshua.decoder.ff.lm.KenLM.StateProbPair;
@@ -42,9 +43,6 @@ import org.apache.joshua.decoder.segment_file.Sentence;
  */
 public class StateMinimizingLanguageModel extends LanguageModelFF {
 
-  // maps from sentence numbers to KenLM-side pools used to allocate state
-  private static final ConcurrentHashMap<Integer, Long> poolMap = new ConcurrentHashMap<>();
-
   public StateMinimizingLanguageModel(FeatureVector weights, String[] args, JoshuaConfiguration config) {
     super(weights, args, config);
     this.type = "kenlm";
@@ -87,6 +85,8 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
     return lmCost + oovCost;
   }
 
+  private UUID languageModelPoolId = UUID.randomUUID();
+
   /**
    * Computes the features incurred along this edge. Note that these features are unweighted costs
    * of the feature; they are the feature cost, not the model cost, or the inner product of them.
@@ -115,14 +115,11 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
      // map to ken lm ids
     final long[] words = mapToKenLmIds(ruleWords, tailNodes, false);
 
-    final int sentID = sentence.id();
-    // Since sentId is unique across threads, next operations are safe, but not atomic!
-    if (!poolMap.containsKey(sentID)) {
-      poolMap.put(sentID, ((KenLM) languageModel).createLMPool());
-    }
+    KenLMPool statePool = sentence.getStateManager().getStatePool(languageModelPoolId, (KenLM)
+            languageModel);
 
     // Get the probability of applying the rule and the new state
-    final StateProbPair pair = ((KenLM) languageModel).probRule(words, poolMap.get(sentID));
+    final StateProbPair pair = ((KenLM) languageModel).probRule(words, statePool);
 
     // Record the prob
     acc.add(denseFeatureIndex, pair.prob);
@@ -162,19 +159,6 @@ public class StateMinimizingLanguageModel extends LanguageModelFF {
   }
 
   /**
-   * Destroys the pool created to allocate state for this sentence. Called from the
-   * {@link org.apache.joshua.decoder.Translation} class after outputting the sentence or k-best list. Hosting
-   * this map here in KenLMFF statically allows pools to be shared across KenLM instances.
-   *
-   * @param sentId a key in the poolmap table to destroy
-   */
-  public void destroyPool(int sentId) {
-    if (poolMap.containsKey(sentId))
-      ((KenLM) languageModel).destroyLMPool(poolMap.get(sentId));
-    poolMap.remove(sentId);
-  }
-
-  /**
    * This function differs from regular transitions because we incorporate the cost of incomplete
    * left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
    * requested when the object was created).

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java b/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
index 7127870..f84c41a 100644
--- a/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
+++ b/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
@@ -21,16 +21,21 @@ package org.apache.joshua.decoder.segment_file;
 import static org.apache.joshua.util.FormatUtils.addSentenceMarkers;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
+import java.util.Map;
 import java.util.StringTokenizer;
+import java.util.UUID;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.joshua.corpus.Vocabulary;
 import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.KenLMPool;
+import org.apache.joshua.decoder.LanguageModelStateManager;
 import org.apache.joshua.decoder.ff.tm.Grammar;
 import org.apache.joshua.lattice.Arc;
 import org.apache.joshua.lattice.Lattice;
@@ -77,6 +82,8 @@ public class Sentence {
   
   public JoshuaConfiguration config = null;
 
+  private LanguageModelStateManager stateManager = new LanguageModelStateManager();
+
   /**
    * Constructor. Receives a string representing the input sentence. This string may be a
    * string-encoded lattice or a plain text string for decoding.
@@ -447,4 +454,8 @@ public class Sentence {
   public Node<Token> getNode(int i) {
     return getLattice().getNode(i);
   }
+
+  public LanguageModelStateManager getStateManager() {
+    return stateManager;
+  }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0252942d/src/test/java/org/apache/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/system/KenLmTest.java b/src/test/java/org/apache/joshua/system/KenLmTest.java
index aa396d2..003b5d9 100644
--- a/src/test/java/org/apache/joshua/system/KenLmTest.java
+++ b/src/test/java/org/apache/joshua/system/KenLmTest.java
@@ -19,6 +19,7 @@
 package org.apache.joshua.system;
 
 import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.KenLMPool;
 import org.apache.joshua.decoder.ff.lm.KenLM;
 import org.apache.joshua.util.io.KenLmTestUtil;
 import org.testng.annotations.AfterMethod;
@@ -29,8 +30,7 @@ import static org.apache.joshua.corpus.Vocabulary.registerLanguageModel;
 import static org.apache.joshua.corpus.Vocabulary.unregisterLanguageModels;
 import static org.hamcrest.MatcherAssert.assertThat;
 import static org.hamcrest.core.Is.is;
-import static org.mockito.Matchers.isNotNull;
-import static org.mockito.Matchers.notNull;
+import static org.hamcrest.core.IsNull.notNullValue;
 import static org.testng.Assert.assertTrue;
 import static org.testng.AssertJUnit.assertEquals;
 import static org.testng.AssertJUnit.assertFalse;
@@ -84,7 +84,7 @@ public class KenLmTest {
   }
 
   @Test
-  public void givenKenLm_whenQueryingForNgramProbability2_thenIdAndStringMethodsReturnTheSame() {
+  public void givenKenLm_whenQueryingWithState_thenStateAndProbReturned() {
     // GIVEN
     KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
 
@@ -94,16 +94,18 @@ public class KenLmTest {
     int[] ids = Vocabulary.addAll(sentence);
     long[] longIds = new long[ids.length];
 
-    for(int i = 0; i< words.length; i++) {
+    for (int i = 0; i < words.length; i++) {
       longIds[i] = ids[i];
     }
 
     // WHEN
-    long poolPointer = kenLm.createLMPool();
-    KenLM.StateProbPair result = kenLm.probRule(longIds, poolPointer);
-    kenLm.destroyLMPool(poolPointer);
+    KenLM.StateProbPair result;
+    try (KenLMPool poolPointer = kenLm.createLMPool()) {
+      result = kenLm.probRule(longIds, poolPointer);
+    }
 
     // THEN
+    assertThat(result, is(notNullValue()));
     assertThat(result.state.getState(), is(0L));
     assertThat(result.prob, is(-3.7906885f));
   }


[7/7] incubator-joshua git commit: Merge branch 'kellen-kenlm_pool_fix'

Posted by mj...@apache.org.
Merge branch 'kellen-kenlm_pool_fix'


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/90fff5ab
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/90fff5ab
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/90fff5ab

Branch: refs/heads/master
Commit: 90fff5ab1de3da23c0f64f90e69ce0da2392fd49
Parents: 99e8f95 bdd670b
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Sep 13 17:48:21 2016 +0200
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Sep 13 17:48:21 2016 +0200

----------------------------------------------------------------------
 jni/kenlm_wrap.cc                               | 144 +++++++++----------
 .../org/apache/joshua/decoder/KenLMPool.java    |  42 ++++++
 .../decoder/LanguageModelStateManager.java      |  29 ++++
 .../org/apache/joshua/decoder/Translation.java  |  17 +--
 .../org/apache/joshua/decoder/ff/lm/KenLM.java  |  24 ++--
 .../ff/lm/StateMinimizingLanguageModel.java     |  30 +---
 .../joshua/decoder/segment_file/Sentence.java   |  11 ++
 .../org/apache/joshua/system/KenLmTest.java     |  30 ++++
 8 files changed, 196 insertions(+), 131 deletions(-)
----------------------------------------------------------------------



[5/7] incubator-joshua git commit: Moved resources, disabled test

Posted by mj...@apache.org.
Moved resources, disabled test


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/99e8f951
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/99e8f951
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/99e8f951

Branch: refs/heads/master
Commit: 99e8f951009213b55595c83cab2f0db3701c04ca
Parents: 49bbcac
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Sep 13 16:32:15 2016 +0200
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Sep 13 16:32:15 2016 +0200

----------------------------------------------------------------------
 .../decoder/phrase/constrained/corpus.es        |   1 -
 .../decoder/phrase/constrained/glue.grammar     |   3 --
 .../decoder/phrase/constrained/joshua.config    |  29 -----------------
 .../decoder/phrase/constrained/output.gold      |   5 ---
 .../decoder/phrase/constrained/test.sh          |  32 -------------------
 src/test/resources/decoder/phrase/decode/config |  29 -----------------
 .../decoder/phrase/decode/config.packed         |  29 -----------------
 .../resources/decoder/phrase/decode/corpus.es   |   1 -
 .../resources/decoder/phrase/decode/lm.1.gz     | Bin 2235 -> 0 bytes
 .../resources/decoder/phrase/decode/output.gold |   1 -
 .../resources/decoder/phrase/decode/rules.1.gz  | Bin 2998042 -> 0 bytes
 .../decoder/phrase/decode/rules.packed/config   |   2 --
 .../decoder/phrase/decode/rules.packed/encoding | Bin 87 -> 0 bytes
 .../decode/rules.packed/slice_00000.features    | Bin 4128858 -> 0 bytes
 .../decode/rules.packed/slice_00000.source      | Bin 1982228 -> 0 bytes
 .../decode/rules.packed/slice_00000.target      | Bin 1463856 -> 0 bytes
 .../rules.packed/slice_00000.target.lookup      | Bin 28 -> 0 bytes
 .../phrase/decode/rules.packed/vocabulary       | Bin 169225 -> 0 bytes
 .../decoder/phrase/decode/test-packed.sh        |  32 -------------------
 .../resources/decoder/phrase/decode/test.sh     |  17 ----------
 src/test/resources/phrase_decoder/config.packed |  29 +++++++++++++++++
 .../phrase_decoder/rules.packed/config          |   2 ++
 .../phrase_decoder/rules.packed/encoding        | Bin 0 -> 87 bytes
 .../rules.packed/slice_00000.features           | Bin 0 -> 4128858 bytes
 .../rules.packed/slice_00000.source             | Bin 0 -> 1982228 bytes
 .../rules.packed/slice_00000.target             | Bin 0 -> 1463856 bytes
 .../rules.packed/slice_00000.target.lookup      | Bin 0 -> 28 bytes
 .../phrase_decoder/rules.packed/vocabulary      | Bin 0 -> 169225 bytes
 28 files changed, 31 insertions(+), 181 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/corpus.es
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/corpus.es b/src/test/resources/decoder/phrase/constrained/corpus.es
deleted file mode 100644
index a063f9a..0000000
--- a/src/test/resources/decoder/phrase/constrained/corpus.es
+++ /dev/null
@@ -1 +0,0 @@
-una estrategia republicana para obstaculizar la reelección de Obama ||| President Obama to hinder a strategy for Republican re @-@ election

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/glue.grammar
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/glue.grammar b/src/test/resources/decoder/phrase/constrained/glue.grammar
deleted file mode 100644
index 6a1162f..0000000
--- a/src/test/resources/decoder/phrase/constrained/glue.grammar
+++ /dev/null
@@ -1,3 +0,0 @@
-[GOAL] ||| <s> ||| <s> ||| 0
-[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
-[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/joshua.config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/joshua.config b/src/test/resources/decoder/phrase/constrained/joshua.config
deleted file mode 100644
index be45e0a..0000000
--- a/src/test/resources/decoder/phrase/constrained/joshua.config
+++ /dev/null
@@ -1,29 +0,0 @@
-tm = moses pt 0 ../decode/rules.1.gz
-
-lm = kenlm 5 true false 100 ../decode/lm.1.gz
-
-mark-oovs = false
-pop-limit = 10
-top-n = 5
-
-output-format = %i ||| %s ||| %f ||| %c
-
-include-align-index = false
-reordering-limit = 10
-use-unique-nbest = false
-
-# And these are the feature functions to activate.
-feature-function = OOVPenalty
-feature-function = WordPenalty
-feature-function = Distortion
-feature-function = PhrasePenalty -owner pt
-
-OOVPenalty 1.0
-Distortion 0.114849
-WordPenalty -0.201544
-PhrasePenalty -0.236965
-tm_pt_0 0.0370068
-tm_pt_1 0.0495759
-tm_pt_2 0.196742
-tm_pt_3 0.0745423
-lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/output.gold
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/output.gold b/src/test/resources/decoder/phrase/constrained/output.gold
deleted file mode 100644
index a784043..0000000
--- a/src/test/resources/decoder/phrase/constrained/output.gold
+++ /dev/null
@@ -1,5 +0,0 @@
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-15.792 tm_pt_1=-17.550 tm_pt_2=-14.599 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=7.000 ||| -15.163
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-16.919 tm_pt_1=-17.550 tm_pt_2=-14.917 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=8.000 ||| -15.505
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-14.986 tm_pt_1=-17.951 tm_pt_2=-14.075 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=6.000 ||| -15.762
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-16.112 tm_pt_1=-17.951 tm_pt_2=-14.393 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.103
-0 ||| President Obama to hinder a strategy for Republican re @-@ election ||| tm_pt_0=-16.329 tm_pt_1=-17.951 tm_pt_2=-15.136 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.257

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/constrained/test.sh
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/constrained/test.sh b/src/test/resources/decoder/phrase/constrained/test.sh
deleted file mode 100755
index 6bef145..0000000
--- a/src/test/resources/decoder/phrase/constrained/test.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c joshua.config > output 2> log
-
-# Compare
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
-  rm -f diff output log
-  exit 0
-else
-  exit 1
-fi
-
-

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/config b/src/test/resources/decoder/phrase/decode/config
deleted file mode 100644
index 7e6f2b8..0000000
--- a/src/test/resources/decoder/phrase/decode/config
+++ /dev/null
@@ -1,29 +0,0 @@
-tm = moses -owner pt -maxspan 0 -path rules.1.gz -max-source-len 5
-feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
-
-search = stack
-
-mark-oovs = false
-pop-limit = 10
-top-n = 1
-
-output-format = %i ||| %s ||| %f ||| %c
-
-include-align-index = false
-reordering-limit = 6
-
-# And these are the feature functions to activate.
-feature-function = OOVPenalty
-feature-function = WordPenalty
-feature-function = Distortion
-feature-function = PhrasePenalty -owner pt
-
-OOVPenalty 1.0
-Distortion 0.114849
-WordPenalty -0.201544
-PhrasePenalty -0.236965
-tm_pt_0 0.0370068
-tm_pt_1 0.0495759
-tm_pt_2 0.196742
-tm_pt_3 0.0745423
-lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/config.packed
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/config.packed b/src/test/resources/decoder/phrase/decode/config.packed
deleted file mode 100644
index 9987b1a..0000000
--- a/src/test/resources/decoder/phrase/decode/config.packed
+++ /dev/null
@@ -1,29 +0,0 @@
-tm = moses -owner pt -maxspan 0 -path rules.packed -max-source-len 5
-feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
-
-search = stack
-
-mark-oovs = false
-pop-limit = 10
-top-n = 1
-
-output-format = %i ||| %s ||| %f ||| %c
-
-include-align-index = false
-reordering-limit = 6
-
-# And these are the feature functions to activate.
-feature-function = OOVPenalty
-feature-function = WordPenalty
-feature-function = Distortion
-feature-function = PhrasePenalty -owner pt
-
-OOVPenalty 1.0
-Distortion 0.114849
-WordPenalty -0.201544
-PhrasePenalty -0.236965
-tm_pt_0 0.0370068
-tm_pt_1 0.0495759
-tm_pt_2 0.196742
-tm_pt_3 0.0745423
-lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/corpus.es
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/corpus.es b/src/test/resources/decoder/phrase/decode/corpus.es
deleted file mode 100644
index 6e255f9..0000000
--- a/src/test/resources/decoder/phrase/decode/corpus.es
+++ /dev/null
@@ -1 +0,0 @@
-una estrategia republicana para obstaculizar la reelección de Obama 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/lm.1.gz
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/lm.1.gz b/src/test/resources/decoder/phrase/decode/lm.1.gz
deleted file mode 100644
index 3f4c453..0000000
Binary files a/src/test/resources/decoder/phrase/decode/lm.1.gz and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/output.gold
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/output.gold b/src/test/resources/decoder/phrase/decode/output.gold
deleted file mode 100644
index 0083345..0000000
--- a/src/test/resources/decoder/phrase/decode/output.gold
+++ /dev/null
@@ -1 +0,0 @@
-0 ||| a strategy republican to hinder reelection Obama ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.1.gz
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.1.gz b/src/test/resources/decoder/phrase/decode/rules.1.gz
deleted file mode 100644
index 14466e9..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.1.gz and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/config
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/config b/src/test/resources/decoder/phrase/decode/rules.packed/config
deleted file mode 100644
index 2251fe6..0000000
--- a/src/test/resources/decoder/phrase/decode/rules.packed/config
+++ /dev/null
@@ -1,2 +0,0 @@
-version = 4
-max-source-len = 3

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/encoding
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/encoding b/src/test/resources/decoder/phrase/decode/rules.packed/encoding
deleted file mode 100644
index 57e7b75..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/encoding and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features
deleted file mode 100644
index 2a77e43..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.features and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source
deleted file mode 100644
index c384c54..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.source and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target
deleted file mode 100644
index 8375cf0..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup b/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup
deleted file mode 100644
index 3e8c294..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/slice_00000.target.lookup and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary b/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary
deleted file mode 100644
index 528a970..0000000
Binary files a/src/test/resources/decoder/phrase/decode/rules.packed/vocabulary and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/test-packed.sh
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/test-packed.sh b/src/test/resources/decoder/phrase/decode/test-packed.sh
deleted file mode 100755
index a65c031..0000000
--- a/src/test/resources/decoder/phrase/decode/test-packed.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c config.packed > output 2> log
-
-# Compare
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
-  rm -f diff output log
-  exit 0
-else
-  exit 1
-fi
-
-

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/decoder/phrase/decode/test.sh
----------------------------------------------------------------------
diff --git a/src/test/resources/decoder/phrase/decode/test.sh b/src/test/resources/decoder/phrase/decode/test.sh
deleted file mode 100755
index 4732f73..0000000
--- a/src/test/resources/decoder/phrase/decode/test.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-set -u
-
-cat corpus.es | $JOSHUA/bin/joshua-decoder -threads 1 -c config > output 2> log
-
-# Compare
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
-  rm -f diff output log
-  exit 0
-else
-  exit 1
-fi
-
-

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/config.packed
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/config.packed b/src/test/resources/phrase_decoder/config.packed
new file mode 100644
index 0000000..9987b1a
--- /dev/null
+++ b/src/test/resources/phrase_decoder/config.packed
@@ -0,0 +1,29 @@
+tm = moses -owner pt -maxspan 0 -path rules.packed -max-source-len 5
+feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file lm.1.gz
+
+search = stack
+
+mark-oovs = false
+pop-limit = 10
+top-n = 1
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = false
+reordering-limit = 6
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/config
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/config b/src/test/resources/phrase_decoder/rules.packed/config
new file mode 100644
index 0000000..2251fe6
--- /dev/null
+++ b/src/test/resources/phrase_decoder/rules.packed/config
@@ -0,0 +1,2 @@
+version = 4
+max-source-len = 3

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/encoding
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/encoding b/src/test/resources/phrase_decoder/rules.packed/encoding
new file mode 100644
index 0000000..57e7b75
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/encoding differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.features
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.features b/src/test/resources/phrase_decoder/rules.packed/slice_00000.features
new file mode 100644
index 0000000..2a77e43
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.features differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.source
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.source b/src/test/resources/phrase_decoder/rules.packed/slice_00000.source
new file mode 100644
index 0000000..c384c54
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.source differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.target
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.target b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target
new file mode 100644
index 0000000..8375cf0
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup
new file mode 100644
index 0000000..3e8c294
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/slice_00000.target.lookup differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/99e8f951/src/test/resources/phrase_decoder/rules.packed/vocabulary
----------------------------------------------------------------------
diff --git a/src/test/resources/phrase_decoder/rules.packed/vocabulary b/src/test/resources/phrase_decoder/rules.packed/vocabulary
new file mode 100644
index 0000000..528a970
Binary files /dev/null and b/src/test/resources/phrase_decoder/rules.packed/vocabulary differ


[3/7] incubator-joshua git commit: Adapted Java side of JNI interface to get state and prob from packed long

Posted by mj...@apache.org.
Adapted Java side of JNI interface to get state and prob from packed long


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/4e07bb66
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/4e07bb66
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/4e07bb66

Branch: refs/heads/master
Commit: 4e07bb66d28e55357ee6b19b3c60a76a31d8dd75
Parents: 929760a
Author: Kellen Sunderland <ke...@amazon.com>
Authored: Tue Sep 13 12:39:41 2016 +0200
Committer: Kellen Sunderland <ke...@amazon.com>
Committed: Tue Sep 13 12:39:41 2016 +0200

----------------------------------------------------------------------
 jni/kenlm_wrap.cc                               |  9 +++---
 .../org/apache/joshua/decoder/ff/lm/KenLM.java  |  7 +++--
 .../org/apache/joshua/system/KenLmTest.java     | 29 ++++++++++++++++++++
 3 files changed, 39 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4e07bb66/jni/kenlm_wrap.cc
----------------------------------------------------------------------
diff --git a/jni/kenlm_wrap.cc b/jni/kenlm_wrap.cc
index 8f69e19..bbe6e7c 100644
--- a/jni/kenlm_wrap.cc
+++ b/jni/kenlm_wrap.cc
@@ -95,7 +95,7 @@ class Chart {
       return *ins.first;
     }
 
-    const ChartState &InterpretState(StateIndex index) const {
+    const lm::ngram::ChartState &InterpretState(StateIndex index) const {
       return vec_[index];
     }
 
@@ -201,7 +201,7 @@ public:
     lm::ngram::RuleScore<Model> ruleScore(m_, state);
 
     if (*begin < 0) {
-      ruleScore.BeginNonTerminal(chart.Interpet(-*begin));
+      ruleScore.BeginNonTerminal(chart.InterpretState(-*begin));
     } else {
       const lm::WordIndex word = map_[*begin];
       if (word == m_.GetVocabulary().BeginSentence()) {
@@ -213,7 +213,7 @@ public:
     for (jlong* i = begin + 1; i != end; i++) {
       long word = *i;
       if (word < 0)
-        ruleScore.NonTerminal(chart.Interpret(-word));
+        ruleScore.NonTerminal(chart.InterpretState(-word));
       else
         ruleScore.Terminal(map_[word]);
     }
@@ -449,6 +449,7 @@ union FloatConverter {
 
 JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
   JNIEnv *env, jclass, jlong pointer, jlong chartPtr, jlongArray arr) {
+
   jint length = env->GetArrayLength(arr);
   // GCC only.
   jlong values[length];
@@ -458,7 +459,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_joshua_decoder_ff_lm_KenLM_probRule(
   lm::ngram::ChartState outState;
   const VirtualBase *base = reinterpret_cast<const VirtualBase*>(pointer);
   Chart* chart = reinterpret_cast<Chart*>(chartPtr);
-  FloatConvert prob;
+  FloatConverter prob;
   prob.f = base->ProbRule(values, values + length, outState, *chart);
 
   StateIndex index = chart->Intern(outState);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4e07bb66/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
index 044c85f..b0a1117 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/lm/KenLM.java
@@ -61,7 +61,7 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
 
   private static native boolean isLmOov(long ptr, int word);
 
-  private static native StateProbPair probRule(long ptr, long pool, long words[]);
+  private static native long probRule(long ptr, long pool, long words[]);
 
   private static native float estimateRule(long ptr, long words[]);
 
@@ -161,7 +161,10 @@ public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
 
     StateProbPair pair = null;
     try {
-      pair = probRule(pointer, poolPointer, words);
+      long packedResult = probRule(pointer, poolPointer, words);
+      int state = (int) (packedResult >> 32);
+      float probVal = Float.intBitsToFloat((int)packedResult);
+      pair = new StateProbPair(state, probVal);
     } catch (NoSuchMethodError e) {
       e.printStackTrace();
       System.exit(1);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/4e07bb66/src/test/java/org/apache/joshua/system/KenLmTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/joshua/system/KenLmTest.java b/src/test/java/org/apache/joshua/system/KenLmTest.java
index 74baef3..aa396d2 100644
--- a/src/test/java/org/apache/joshua/system/KenLmTest.java
+++ b/src/test/java/org/apache/joshua/system/KenLmTest.java
@@ -27,6 +27,10 @@ import org.testng.annotations.Test;
 
 import static org.apache.joshua.corpus.Vocabulary.registerLanguageModel;
 import static org.apache.joshua.corpus.Vocabulary.unregisterLanguageModels;
+import static org.hamcrest.MatcherAssert.assertThat;
+import static org.hamcrest.core.Is.is;
+import static org.mockito.Matchers.isNotNull;
+import static org.mockito.Matchers.notNull;
 import static org.testng.Assert.assertTrue;
 import static org.testng.AssertJUnit.assertEquals;
 import static org.testng.AssertJUnit.assertFalse;
@@ -80,6 +84,31 @@ public class KenLmTest {
   }
 
   @Test
+  public void givenKenLm_whenQueryingForNgramProbability2_thenIdAndStringMethodsReturnTheSame() {
+    // GIVEN
+    KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
+
+    registerLanguageModel(kenLm);
+    String sentence = "Wayne Gretzky";
+    String[] words = sentence.split("\\s+");
+    int[] ids = Vocabulary.addAll(sentence);
+    long[] longIds = new long[ids.length];
+
+    for(int i = 0; i< words.length; i++) {
+      longIds[i] = ids[i];
+    }
+
+    // WHEN
+    long poolPointer = kenLm.createLMPool();
+    KenLM.StateProbPair result = kenLm.probRule(longIds, poolPointer);
+    kenLm.destroyLMPool(poolPointer);
+
+    // THEN
+    assertThat(result.state.getState(), is(0L));
+    assertThat(result.prob, is(-3.7906885f));
+  }
+
+  @Test
   public void givenKenLm_whenIsKnownWord_thenReturnValuesAreCorrect() {
     KenLmTestUtil.Guard(() -> kenLm = new KenLM(LANGUAGE_MODEL_PATH));
     assertTrue(kenLm.isKnownWord("Wayne"));