Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:34:02 UTC

[14/51] [partial] incubator-joshua git commit: Converted KenLM into a submodule

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/initial_probabilities.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/initial_probabilities.cc b/ext/kenlm/lm/builder/initial_probabilities.cc
deleted file mode 100644
index 5b8d86d..0000000
--- a/ext/kenlm/lm/builder/initial_probabilities.cc
+++ /dev/null
@@ -1,306 +0,0 @@
-#include "lm/builder/initial_probabilities.hh"
-
-#include "lm/builder/discount.hh"
-#include "lm/builder/hash_gamma.hh"
-#include "lm/builder/payload.hh"
-#include "lm/common/special.hh"
-#include "lm/common/ngram_stream.hh"
-#include "util/murmur_hash.hh"
-#include "util/file.hh"
-#include "util/stream/chain.hh"
-#include "util/stream/io.hh"
-#include "util/stream/stream.hh"
-
-#include <vector>
-
-namespace lm { namespace builder {
-
-namespace {
-struct BufferEntry {
-  // Gamma from page 20 of Chen and Goodman.
-  float gamma;
-  // \sum_w a(c w) for all w.
-  float denominator;
-};
-
-struct HashBufferEntry : public BufferEntry {
-  // Hash value of ngram. Used to join contexts with backoffs.
-  uint64_t hash_value;
-};
-
-// Reads all entries in order, just as NGramStream does.
-// But it deletes any entries whose CutoffCount is at or below the pruning
-// threshold.
-class PruneNGramStream {
-  public:
-    PruneNGramStream(const util::stream::ChainPosition &position, const SpecialVocab &specials) :
-      current_(NULL, NGram<BuildingPayload>::OrderFromSize(position.GetChain().EntrySize())),
-      dest_(NULL, NGram<BuildingPayload>::OrderFromSize(position.GetChain().EntrySize())),
-      currentCount_(0),
-      block_(position),
-      specials_(specials)
-    {
-      StartBlock();
-    }
-
-    NGram<BuildingPayload> &operator*() { return current_; }
-    NGram<BuildingPayload> *operator->() { return &current_; }
-
-    operator bool() const {
-      return block_;
-    }
-
-    PruneNGramStream &operator++() {
-      assert(block_);
-      if(UTIL_UNLIKELY(current_.Order() == 1 && specials_.IsSpecial(*current_.begin())))
-        dest_.NextInMemory();
-      else if(currentCount_ > 0) {
-        if(dest_.Base() < current_.Base()) {
-          memcpy(dest_.Base(), current_.Base(), current_.TotalSize());
-        }
-        dest_.NextInMemory();
-      }
-
-      current_.NextInMemory();
-
-      uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
-      if (current_.Base() == block_base + block_->ValidSize()) {
-        block_->SetValidSize(dest_.Base() - block_base);
-        ++block_;
-        StartBlock();
-        if (block_) {
-          currentCount_ = current_.Value().CutoffCount();
-        }
-      } else {
-        currentCount_ = current_.Value().CutoffCount();
-      }
-
-      return *this;
-    }
-
-  private:
-    void StartBlock() {
-      for (; ; ++block_) {
-        if (!block_) return;
-        if (block_->ValidSize()) break;
-      }
-      current_.ReBase(block_->Get());
-      currentCount_ = current_.Value().CutoffCount();
-
-      dest_.ReBase(block_->Get());
-    }
-
-    NGram<BuildingPayload> current_; // input iterator
-    NGram<BuildingPayload> dest_;    // output iterator
-
-    uint64_t currentCount_;
-
-    util::stream::Link block_;
-
-    const SpecialVocab specials_;
-};
-
-// Extract an array of HashedGamma from an array of BufferEntry.
-class OnlyGamma {
-  public:
-    explicit OnlyGamma(bool pruning) : pruning_(pruning) {}
-
-    void Run(const util::stream::ChainPosition &position) {
-      for (util::stream::Link block_it(position); block_it; ++block_it) {
-        if(pruning_) {
-          const HashBufferEntry *in = static_cast<const HashBufferEntry*>(block_it->Get());
-          const HashBufferEntry *end = static_cast<const HashBufferEntry*>(block_it->ValidEnd());
-
-          // Just make it point to the beginning of the stream so it can be overwritten
-          // with HashGamma values. Do not attempt to interpret the values until set below.
-          HashGamma *out = static_cast<HashGamma*>(block_it->Get());
-          for (; in < end; out += 1, in += 1) {
-            // buffering, otherwise might overwrite values too early
-            float gamma_buf = in->gamma;
-            uint64_t hash_buf = in->hash_value;
-
-            out->gamma = gamma_buf;
-            out->hash_value = hash_buf;
-          }
-          block_it->SetValidSize((block_it->ValidSize() * sizeof(HashGamma)) / sizeof(HashBufferEntry));
-        }
-        else {
-          float *out = static_cast<float*>(block_it->Get());
-          const float *in = out;
-          const float *end = static_cast<const float*>(block_it->ValidEnd());
-          for (out += 1, in += 2; in < end; out += 1, in += 2) {
-            *out = *in;
-          }
-          block_it->SetValidSize(block_it->ValidSize() / 2);
-        }
-      }
-    }
-
-    private:
-      bool pruning_;
-};
-
-class AddRight {
-  public:
-    AddRight(const Discount &discount, const util::stream::ChainPosition &input, bool pruning)
-      : discount_(discount), input_(input), pruning_(pruning) {}
-
-    void Run(const util::stream::ChainPosition &output) {
-      NGramStream<BuildingPayload> in(input_);
-      util::stream::Stream out(output);
-
-      std::vector<WordIndex> previous(in->Order() - 1);
-      // Silly windows requires this workaround to just get an invalid pointer when empty.
-      void *const previous_raw = previous.empty() ? NULL : static_cast<void*>(&previous[0]);
-      const std::size_t size = sizeof(WordIndex) * previous.size();
-
-      for(; in; ++out) {
-        memcpy(previous_raw, in->begin(), size);
-        uint64_t denominator = 0;
-        uint64_t normalizer = 0;
-
-        uint64_t counts[4];
-        memset(counts, 0, sizeof(counts));
-        do {
-          denominator += in->Value().UnmarkedCount();
-
-          // Collect unused probability mass from pruning.
-          // Becomes 0 for unpruned ngrams.
-          normalizer += in->Value().UnmarkedCount() - in->Value().CutoffCount();
-
-          // Chen & Goodman do not mention counting based on cutoffs, but the
-          // backoff becomes larger than 1 otherwise, so this probably needs
-          // to count cutoffs. Without pruning it counts normally.
-          if(in->Value().CutoffCount() > 0)
-            ++counts[std::min(in->Value().CutoffCount(), static_cast<uint64_t>(3))];
-
-        } while (++in && !memcmp(previous_raw, in->begin(), size));
-
-        BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
-        entry.denominator = static_cast<float>(denominator);
-        entry.gamma = 0.0;
-        for (unsigned i = 1; i <= 3; ++i) {
-          entry.gamma += discount_.Get(i) * static_cast<float>(counts[i]);
-        }
-
-        // Makes model sum to 1 with pruning (I hope).
-        entry.gamma += normalizer;
-
-        entry.gamma /= entry.denominator;
-
-        if(pruning_) {
-          // If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...),
-          // so add a hash value that identifies the current ngram.
-          static_cast<HashBufferEntry*>(&entry)->hash_value = util::MurmurHashNative(previous_raw, size);
-        }
-      }
-      out.Poison();
-    }
-
-  private:
-    const Discount &discount_;
-    const util::stream::ChainPosition input_;
-    bool pruning_;
-};
-
-class MergeRight {
-  public:
-    MergeRight(bool interpolate_unigrams, const util::stream::ChainPosition &from_adder, const Discount &discount, const SpecialVocab &specials)
-      : interpolate_unigrams_(interpolate_unigrams), from_adder_(from_adder), discount_(discount), specials_(specials) {}
-
-    // calculate the initial probability of each n-gram (before order-interpolation)
-    // Run() gets invoked once for each order
-    void Run(const util::stream::ChainPosition &primary) {
-      util::stream::Stream summed(from_adder_);
-
-      PruneNGramStream grams(primary, specials_);
-
-      // Without interpolation, the interpolation weight goes to <unk>.
-      if (grams->Order() == 1) {
-        BufferEntry sums(*static_cast<const BufferEntry*>(summed.Get()));
-        // Special case for <unk>
-        assert(*grams->begin() == kUNK);
-        float gamma_assign;
-        if (interpolate_unigrams_) {
-          // Default: treat <unk> like a zeroton.
-          gamma_assign = sums.gamma;
-          grams->Value().uninterp.prob = 0.0;
-        } else {
-          // SRI: give all the interpolation mass to <unk>
-          gamma_assign = 0.0;
-          grams->Value().uninterp.prob = sums.gamma;
-        }
-        grams->Value().uninterp.gamma = gamma_assign;
-
-        for (++grams; *grams->begin() != specials_.BOS(); ++grams) {
-          grams->Value().uninterp.prob = discount_.Apply(grams->Value().count) / sums.denominator;
-          grams->Value().uninterp.gamma = gamma_assign;
-        }
-
-        // Special case for <s>: probability 1.0.  This allows <s> to be
-        // explicitly scored as part of the sentence without impacting
-        // probability and computes q correctly as b(<s>).
-        assert(*grams->begin() == specials_.BOS());
-        grams->Value().uninterp.prob = 1.0;
-        grams->Value().uninterp.gamma = 0.0;
-
-        while (++grams) {
-          grams->Value().uninterp.prob = discount_.Apply(grams->Value().count) / sums.denominator;
-          grams->Value().uninterp.gamma = gamma_assign;
-        }
-        ++summed;
-        return;
-      }
-
-      std::vector<WordIndex> previous(grams->Order() - 1);
-      const std::size_t size = sizeof(WordIndex) * previous.size();
-      for (; grams; ++summed) {
-        memcpy(&previous[0], grams->begin(), size);
-        const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get());
-
-        do {
-          BuildingPayload &pay = grams->Value();
-          pay.uninterp.prob = discount_.Apply(grams->Value().UnmarkedCount()) / sums.denominator;
-          pay.uninterp.gamma = sums.gamma;
-        } while (++grams && !memcmp(&previous[0], grams->begin(), size));
-      }
-    }
-
-  private:
-    bool interpolate_unigrams_;
-    util::stream::ChainPosition from_adder_;
-    Discount discount_;
-    const SpecialVocab specials_;
-};
-
-} // namespace
-
-void InitialProbabilities(
-    const InitialProbabilitiesConfig &config,
-    const std::vector<Discount> &discounts,
-    util::stream::Chains &primary,
-    util::stream::Chains &second_in,
-    util::stream::Chains &gamma_out,
-    const std::vector<uint64_t> &prune_thresholds,
-    bool prune_vocab,
-    const SpecialVocab &specials) {
-  for (size_t i = 0; i < primary.size(); ++i) {
-    util::stream::ChainConfig gamma_config = config.adder_out;
-    if(prune_vocab || prune_thresholds[i] > 0)
-      gamma_config.entry_size = sizeof(HashBufferEntry);
-    else
-      gamma_config.entry_size = sizeof(BufferEntry);
-
-    util::stream::ChainPosition second(second_in[i].Add());
-    second_in[i] >> util::stream::kRecycle;
-    gamma_out.push_back(gamma_config);
-    gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0);
-
-    primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i], specials);
-
-    // Don't bother with the OnlyGamma thread for something to discard.
-    if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0);
-  }
-}
-
-}} // namespaces
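For readers tracing AddRight above: the gamma stored per context is the modified Kneser-Ney interpolation weight of Chen and Goodman, i.e. the discounted mass D1*N1 + D2*N2 + D3+*N3+ divided by the context's total adjusted count. A standalone sketch of that arithmetic (illustrative names, and it omits the pruning normalizer the loop above also folds in) could look like this:

#include <algorithm>
#include <cstddef>
#include <cstdint>

// Sketch of the interpolation weight AddRight computes for a single context.
// discounts[1..3] play the role of the modified Kneser-Ney discounts D_1, D_2, D_3+
// and counts[] holds the adjusted counts of every extension of the context.
float ContextGamma(const uint64_t *counts, std::size_t n, const float discounts[4]) {
  uint64_t denominator = 0;            // \sum_w a(c w), as in BufferEntry
  uint64_t buckets[4] = {0, 0, 0, 0};  // how many extensions occur 1, 2, 3+ times
  for (std::size_t i = 0; i < n; ++i) {
    denominator += counts[i];
    if (counts[i] > 0)
      ++buckets[std::min(counts[i], static_cast<uint64_t>(3))];
  }
  float gamma = 0.0f;
  for (unsigned j = 1; j <= 3; ++j)
    gamma += discounts[j] * static_cast<float>(buckets[j]);
  return gamma / static_cast<float>(denominator);
}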

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/initial_probabilities.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/initial_probabilities.hh b/ext/kenlm/lm/builder/initial_probabilities.hh
deleted file mode 100644
index caeea58..0000000
--- a/ext/kenlm/lm/builder/initial_probabilities.hh
+++ /dev/null
@@ -1,45 +0,0 @@
-#ifndef LM_BUILDER_INITIAL_PROBABILITIES_H
-#define LM_BUILDER_INITIAL_PROBABILITIES_H
-
-#include "lm/builder/discount.hh"
-#include "lm/word_index.hh"
-#include "util/stream/config.hh"
-
-#include <vector>
-
-namespace util { namespace stream { class Chains; } }
-
-namespace lm {
-class SpecialVocab;
-namespace builder {
-
-struct InitialProbabilitiesConfig {
-  // These should be small buffers to keep the adder from getting too far ahead
-  util::stream::ChainConfig adder_in;
-  util::stream::ChainConfig adder_out;
-  // SRILM doesn't normally interpolate unigrams.
-  bool interpolate_unigrams;
-};
-
-/* Compute initial (uninterpolated) probabilities
- * primary: the normal chain of n-grams.  Incoming is context sorted adjusted
- *   counts.  Outgoing has uninterpolated probabilities for use by Interpolate.
- * second_in: a second copy of the primary input.  Discard the output.
- * gamma_out: Computed gamma values are output on these chains in suffix order.
- *   The values are bare floats and should be buffered for interpolation to
- *   use.
- */
-void InitialProbabilities(
-    const InitialProbabilitiesConfig &config,
-    const std::vector<Discount> &discounts,
-    util::stream::Chains &primary,
-    util::stream::Chains &second_in,
-    util::stream::Chains &gamma_out,
-    const std::vector<uint64_t> &prune_thresholds,
-    bool prune_vocab,
-    const SpecialVocab &vocab);
-
-} // namespace builder
-} // namespace lm
-
-#endif // LM_BUILDER_INITIAL_PROBABILITIES_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/interpolate.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/interpolate.cc b/ext/kenlm/lm/builder/interpolate.cc
deleted file mode 100644
index a62ef43..0000000
--- a/ext/kenlm/lm/builder/interpolate.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-#include "lm/builder/interpolate.hh"
-
-#include "lm/builder/hash_gamma.hh"
-#include "lm/builder/payload.hh"
-#include "lm/common/compare.hh"
-#include "lm/common/joint_order.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/lm_exception.hh"
-#include "util/fixed_array.hh"
-#include "util/murmur_hash.hh"
-
-#include <iostream>
-#include <cassert>
-#include <cmath>
-
-namespace lm { namespace builder {
-namespace {
-
-/* Calculate q, the collapsed probability and backoff, as defined in
- * @inproceedings{Heafield-rest,
- *   author = {Kenneth Heafield and Philipp Koehn and Alon Lavie},
- *   title = {Language Model Rest Costs and Space-Efficient Storage},
- *   year = {2012},
- *   month = {July},
- *   booktitle = {Proceedings of the Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
- *   address = {Jeju Island, Korea},
- *   pages = {1169--1178},
- *   url = {http://kheafield.com/professional/edinburgh/rest\_paper.pdf},
- * }
- * This is particularly convenient to calculate during interpolation because
- * the needed backoff terms are already accessed at the same time.
- */
-class OutputQ {
-  public:
-    explicit OutputQ(std::size_t order) : q_delta_(order) {}
-
-    void Gram(unsigned order_minus_1, float full_backoff, ProbBackoff &out) {
-      float &q_del = q_delta_[order_minus_1];
-      if (order_minus_1) {
-        // Divide by context's backoff (which comes in as out.backoff)
-        q_del = q_delta_[order_minus_1 - 1] / out.backoff * full_backoff;
-      } else {
-        q_del = full_backoff;
-      }
-      out.prob = log10f(out.prob * q_del);
-      // TODO: stop wastefully outputting this!
-      out.backoff = 0.0;
-    }
-
-  private:
-    // Product of backoffs in the numerator divided by backoffs in the
-    // denominator.  Does not include
-    std::vector<float> q_delta_;
-};
-
-/* Default: output probability and backoff */
-class OutputProbBackoff {
-  public:
-    explicit OutputProbBackoff(std::size_t /*order*/) {}
-
-    void Gram(unsigned /*order_minus_1*/, float full_backoff, ProbBackoff &out) const {
-      // Correcting for numerical precision issues.  Take that IRST.
-      out.prob = std::min(0.0f, log10f(out.prob));
-      out.backoff = log10f(full_backoff);
-    }
-};
-
-template <class Output> class Callback {
-  public:
-    Callback(float uniform_prob, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab, const SpecialVocab &specials)
-      : backoffs_(backoffs.size()), probs_(backoffs.size() + 2),
-        prune_thresholds_(prune_thresholds),
-        prune_vocab_(prune_vocab),
-        output_(backoffs.size() + 1 /* order */),
-        specials_(specials) {
-      probs_[0] = uniform_prob;
-      for (std::size_t i = 0; i < backoffs.size(); ++i) {
-        backoffs_.push_back(backoffs[i]);
-      }
-    }
-
-    ~Callback() {
-      for (std::size_t i = 0; i < backoffs_.size(); ++i) {
-        if(prune_vocab_ || prune_thresholds_[i + 1] > 0)
-          while(backoffs_[i])
-            ++backoffs_[i];
-
-        if (backoffs_[i]) {
-          std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl;
-          abort();
-        }
-      }
-    }
-
-    void Enter(unsigned order_minus_1, void *data) {
-      NGram<BuildingPayload> gram(data, order_minus_1 + 1);
-      BuildingPayload &pay = gram.Value();
-      pay.complete.prob = pay.uninterp.prob + pay.uninterp.gamma * probs_[order_minus_1];
-      probs_[order_minus_1 + 1] = pay.complete.prob;
-
-      float out_backoff;
-      if (order_minus_1 < backoffs_.size() && *(gram.end() - 1) != specials_.UNK() && *(gram.end() - 1) != specials_.EOS() && backoffs_[order_minus_1]) {
-        if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) {
-          //Compute hash value for current context
-          uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
-
-          const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
-          while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1])
-            hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
-
-          if(current_hash == hashed_backoff->hash_value) {
-            out_backoff = hashed_backoff->gamma;
-            ++backoffs_[order_minus_1];
-          } else {
-            // Has been pruned away so it is not a context anymore
-            out_backoff = 1.0;
-          }
-        } else {
-          out_backoff = *static_cast<const float*>(backoffs_[order_minus_1].Get());
-          ++backoffs_[order_minus_1];
-        }
-      } else {
-        // Not a context.
-        out_backoff = 1.0;
-      }
-
-      output_.Gram(order_minus_1, out_backoff, pay.complete);
-    }
-
-    void Exit(unsigned, void *) const {}
-
-  private:
-    util::FixedArray<util::stream::Stream> backoffs_;
-
-    std::vector<float> probs_;
-    const std::vector<uint64_t>& prune_thresholds_;
-    bool prune_vocab_;
-
-    Output output_;
-    const SpecialVocab specials_;
-};
-} // namespace
-
-Interpolate::Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t>& prune_thresholds, bool prune_vocab, bool output_q, const SpecialVocab &specials)
-  : uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>.
-    backoffs_(backoffs),
-    prune_thresholds_(prune_thresholds),
-    prune_vocab_(prune_vocab),
-    output_q_(output_q),
-    specials_(specials) {}
-
-// perform order-wise interpolation
-void Interpolate::Run(const util::stream::ChainPositions &positions) {
-  assert(positions.size() == backoffs_.size() + 1);
-  if (output_q_) {
-    typedef Callback<OutputQ> C;
-    C callback(uniform_prob_, backoffs_, prune_thresholds_, prune_vocab_, specials_);
-    JointOrder<C, SuffixOrder>(positions, callback);
-  } else {
-    typedef Callback<OutputProbBackoff> C;
-    C callback(uniform_prob_, backoffs_, prune_thresholds_, prune_vocab_, specials_);
-    JointOrder<C, SuffixOrder>(positions, callback);
-  }
-}
-
-}} // namespaces
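The core of Callback::Enter above is the order-wise recursion p(w | context) = p_uninterp(w | context) + gamma(context) * p(w | shorter context), seeded with the uniform distribution 1/vocab_size. A minimal standalone sketch of that recursion (hypothetical helper, not part of the commit, and it leaves out the backoff/q handling):

#include <cstddef>
#include <vector>

// Interpolate one n-gram from its uninterpolated probabilities and gammas,
// lowest order first, starting from the uniform unigram base distribution.
float InterpolatedProb(const std::vector<float> &uninterp_prob,
                       const std::vector<float> &gamma,
                       float uniform_prob /* 1.0f / vocab_size */) {
  float lower = uniform_prob;  // probability under the next shorter context
  for (std::size_t order = 0; order < uninterp_prob.size(); ++order)
    lower = uninterp_prob[order] + gamma[order] * lower;
  return lower;                // linear space; the output hooks then take log10
}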

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/interpolate.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/interpolate.hh b/ext/kenlm/lm/builder/interpolate.hh
deleted file mode 100644
index d20cd54..0000000
--- a/ext/kenlm/lm/builder/interpolate.hh
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef LM_BUILDER_INTERPOLATE_H
-#define LM_BUILDER_INTERPOLATE_H
-
-#include "lm/common/special.hh"
-#include "lm/word_index.hh"
-#include "util/stream/multi_stream.hh"
-
-#include <vector>
-
-#include <stdint.h>
-
-namespace lm { namespace builder {
-
-/* Interpolate step.
- * Input: suffix sorted n-grams with (p_uninterpolated, gamma) from
- * InitialProbabilities.
- * Output: suffix sorted n-grams with complete probability
- */
-class Interpolate {
-  public:
-    // Normally vocab_size is the unigram count-1 (since p(<s>) = 0) but might
-    // be larger when the user specifies a consistent vocabulary size.
-    explicit Interpolate(uint64_t vocab_size, const util::stream::ChainPositions &backoffs, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab, bool output_q, const SpecialVocab &specials);
-
-    void Run(const util::stream::ChainPositions &positions);
-
-  private:
-    float uniform_prob_;
-    util::stream::ChainPositions backoffs_;
-    const std::vector<uint64_t> prune_thresholds_;
-    bool prune_vocab_;
-    bool output_q_;
-    const SpecialVocab specials_;
-};
-
-}} // namespaces
-#endif // LM_BUILDER_INTERPOLATE_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/lmplz_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/lmplz_main.cc b/ext/kenlm/lm/builder/lmplz_main.cc
deleted file mode 100644
index cc3f381..0000000
--- a/ext/kenlm/lm/builder/lmplz_main.cc
+++ /dev/null
@@ -1,220 +0,0 @@
-#include "lm/builder/output.hh"
-#include "lm/builder/pipeline.hh"
-#include "lm/common/size_option.hh"
-#include "lm/lm_exception.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-#include "util/usage.hh"
-
-#include <iostream>
-
-#include <boost/program_options.hpp>
-#include <boost/version.hpp>
-#include <vector>
-
-namespace {
-
-// Parse and validate pruning thresholds, then return a vector of threshold
-// counts, one for each n-gram order.
-std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::size_t order) {
-  // convert to vector of integers
-  std::vector<uint64_t> prune_thresholds;
-  prune_thresholds.reserve(order);
-  for (std::vector<std::string>::const_iterator it(param.begin()); it != param.end(); ++it) {
-    try {
-      prune_thresholds.push_back(boost::lexical_cast<uint64_t>(*it));
-    } catch(const boost::bad_lexical_cast &) {
-      UTIL_THROW(util::Exception, "Bad pruning threshold " << *it);
-    }
-  }
-
-  // Fill with zeros by default.
-  if (prune_thresholds.empty()) {
-    prune_thresholds.resize(order, 0);
-    return prune_thresholds;
-  }
-
-  // Validate the pruning thresholds if specified.
-  // Throw if thresholds are given for more orders than the model has.
-  UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
-  // threshold for unigram can only be 0 (no pruning)
-
-  // Check that the thresholds are in non-decreasing order.
-  uint64_t lower_threshold = 0;
-  for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
-    UTIL_THROW_IF(lower_threshold > *it, util::Exception, "Pruning thresholds should be in non-decreasing order.  Otherwise substrings would be removed, which is bad for query-time data structures.");
-    lower_threshold = *it;
-  }
-
-  // Pad to all orders using the last value.
-  prune_thresholds.resize(order, prune_thresholds.back());
-  return prune_thresholds;
-}
-
-lm::builder::Discount ParseDiscountFallback(const std::vector<std::string> &param) {
-  lm::builder::Discount ret;
-  UTIL_THROW_IF(param.size() > 3, util::Exception, "Specify at most three fallback discounts: 1, 2, and 3+");
-  UTIL_THROW_IF(param.empty(), util::Exception, "Fallback discounting enabled, but no discount specified");
-  ret.amount[0] = 0.0;
-  for (unsigned i = 0; i < 3; ++i) {
-    float discount = boost::lexical_cast<float>(param[i < param.size() ? i : (param.size() - 1)]);
-    UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
-    ret.amount[i + 1] = discount;
-  }
-  return ret;
-}
-
-} // namespace
-
-int main(int argc, char *argv[]) {
-  try {
-    namespace po = boost::program_options;
-    po::options_description options("Language model building options");
-    lm::builder::PipelineConfig pipeline;
-
-    std::string text, intermediate, arpa;
-    std::vector<std::string> pruning;
-    std::vector<std::string> discount_fallback;
-    std::vector<std::string> discount_fallback_default;
-    discount_fallback_default.push_back("0.5");
-    discount_fallback_default.push_back("1");
-    discount_fallback_default.push_back("1.5");
-    bool verbose_header;
-
-    options.add_options()
-      ("help,h", po::bool_switch(), "Show this help message")
-      ("order,o", po::value<std::size_t>(&pipeline.order)
-#if BOOST_VERSION >= 104200
-         ->required()
-#endif
-         , "Order of the model")
-      ("interpolate_unigrams", po::value<bool>(&pipeline.initial_probs.interpolate_unigrams)->default_value(true)->implicit_value(true), "Interpolate the unigrams (default) as opposed to giving lots of mass to <unk> like SRI.  If you want SRI's behavior with a large <unk> and the old lmplz default, use --interpolate_unigrams 0.")
-      ("skip_symbols", po::bool_switch(), "Treat <s>, </s>, and <unk> as whitespace instead of throwing an exception")
-      ("temp_prefix,T", po::value<std::string>(&pipeline.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
-      ("memory,S", lm:: SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
-      ("minimum_block", lm::SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
-      ("sort_block", lm::SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
-      ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
-      ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
-      ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
-      ("verbose_header", po::bool_switch(&verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
-      ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
-      ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout")
-      ("intermediate", po::value<std::string>(&intermediate), "Write ngrams to intermediate files.  Turns off ARPA output (which can be reactivated by --arpa file).  Forces --renumber on.")
-      ("renumber", po::bool_switch(&pipeline.renumber_vocabulary), "Rrenumber the vocabulary identifiers so that they are monotone with the hash of each string.  This is consistent with the ordering used by the trie data structure.")
-      ("collapse_values", po::bool_switch(&pipeline.output_q), "Collapse probability and backoff into a single value, q that yields the same sentence-level probabilities.  See http://kheafield.com/professional/edinburgh/rest_paper.pdf for more details, including a proof.")
-      ("prune", po::value<std::vector<std::string> >(&pruning)->multitoken(), "Prune n-grams with count less than or equal to the given threshold.  Specify one value for each order i.e. 0 0 1 to prune singleton trigrams and above.  The sequence of values must be non-decreasing and the last value applies to any remaining orders. Default is to not prune, which is equivalent to --prune 0.")
-      ("limit_vocab_file", po::value<std::string>(&pipeline.prune_vocab_file)->default_value(""), "Read allowed vocabulary separated by whitespace. N-grams that contain vocabulary items not in this list will be pruned. Can be combined with --prune arg")
-      ("discount_fallback", po::value<std::vector<std::string> >(&discount_fallback)->multitoken()->implicit_value(discount_fallback_default, "0.5 1 1.5"), "The closed-form estimate for Kneser-Ney discounts does not work without singletons or doubletons.  It can also fail if these values are out of range.  This option falls back to user-specified discounts when the closed-form estimate fails.  Note that this option is generally a bad idea: you should deduplicate your corpus instead.  However, class-based models need custom discounts because they lack singleton unigrams.  Provide up to three discounts (for adjusted counts 1, 2, and 3+), which will be applied to all orders where the closed-form estimates fail.");
-    po::variables_map vm;
-    po::store(po::parse_command_line(argc, argv, options), vm);
-
-    if (argc == 1 || vm["help"].as<bool>()) {
-      std::cerr <<
-        "Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
-        "Please cite:\n"
-        "@inproceedings{Heafield-estimate,\n"
-        "  author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
-        "  title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
-        "  year = {2013},\n"
-        "  month = {8},\n"
-        "  booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
-        "  address = {Sofia, Bulgaria},\n"
-        "  url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
-        "}\n\n"
-        "Provide the corpus on stdin.  The ARPA file will be written to stdout.  Order of\n"
-        "the model (-o) is the only mandatory option.  As this is an on-disk program,\n"
-        "setting the temporary file location (-T) and sorting memory (-S) is recommended.\n\n"
-        "Memory sizes are specified like GNU sort: a number followed by a unit character.\n"
-        "Valid units are \% for percentage of memory (supported platforms only) and (in\n"
-        "increasing powers of 1024): b, K, M, G, T, P, E, Z, Y.  Default is K (*1024).\n";
-      uint64_t mem = util::GuessPhysicalMemory();
-      if (mem) {
-        std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
-      } else {
-        std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
-      }
-      std::cerr << options << std::endl;
-      return 1;
-    }
-
-    po::notify(vm);
-
-    // required() appeared in Boost 1.42.0.
-#if BOOST_VERSION < 104200
-    if (!vm.count("order")) {
-      std::cerr << "the option '--order' is required but missing" << std::endl;
-      return 1;
-    }
-#endif
-
-    if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) {
-      std::cerr << "--vocab_pad requires --interpolate_unigrams be on" << std::endl;
-      return 1;
-    }
-
-    if (vm["skip_symbols"].as<bool>()) {
-      pipeline.disallowed_symbol_action = lm::COMPLAIN;
-    } else {
-      pipeline.disallowed_symbol_action = lm::THROW_UP;
-    }
-
-    if (vm.count("discount_fallback")) {
-      pipeline.discount.fallback = ParseDiscountFallback(discount_fallback);
-      pipeline.discount.bad_action = lm::COMPLAIN;
-    } else {
-      // Unused, just here to prevent the compiler from complaining about uninitialized.
-      pipeline.discount.fallback = lm::builder::Discount();
-      pipeline.discount.bad_action = lm::THROW_UP;
-    }
-
-    // parse pruning thresholds.  These depend on order, so it is not done as a notifier.
-    pipeline.prune_thresholds = ParsePruning(pruning, pipeline.order);
-
-    if (!vm["limit_vocab_file"].as<std::string>().empty()) {
-      pipeline.prune_vocab = true;
-    }
-    else {
-      pipeline.prune_vocab = false;
-    }
-
-    util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
-
-    lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
-    // TODO: evaluate options for these.
-    initial.adder_in.total_memory = 32768;
-    initial.adder_in.block_count = 2;
-    initial.adder_out.total_memory = 32768;
-    initial.adder_out.block_count = 2;
-    pipeline.read_backoffs = initial.adder_out;
-
-    // Read from stdin, write to stdout by default
-    util::scoped_fd in(0), out(1);
-    if (vm.count("text")) {
-      in.reset(util::OpenReadOrThrow(text.c_str()));
-    }
-    if (vm.count("arpa")) {
-      out.reset(util::CreateOrThrow(arpa.c_str()));
-    }
-
-    try {
-      bool writing_intermediate = vm.count("intermediate");
-      if (writing_intermediate) {
-        pipeline.renumber_vocabulary = true;
-      }
-      lm::builder::Output output(writing_intermediate ? intermediate : pipeline.sort.temp_prefix, writing_intermediate, pipeline.output_q);
-      if (!writing_intermediate || vm.count("arpa")) {
-        output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
-      }
-      lm::builder::Pipeline(pipeline, in.release(), output);
-    } catch (const util::MallocException &e) {
-      std::cerr << e.what() << std::endl;
-      std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
-      return 1;
-    }
-    util::PrintUsage(std::cerr);
-  } catch (const std::exception &e) {
-    std::cerr << e.what() << std::endl;
-    return 1;
-  }
-}
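One detail of the --prune handling above that is easy to miss: thresholds are listed per order and ParsePruning pads any remaining orders with the last value given. A self-contained illustration of that padding rule (the numbers are only an example, roughly what "--prune 0 0 1" would mean for a 5-gram model):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const std::size_t order = 5;            // e.g. a 5-gram model
  std::vector<uint64_t> thresholds;       // as parsed from "--prune 0 0 1"
  thresholds.push_back(0);
  thresholds.push_back(0);
  thresholds.push_back(1);
  // Same padding ParsePruning performs: repeat the last value for higher orders.
  thresholds.resize(order, thresholds.back());
  for (std::size_t i = 0; i < thresholds.size(); ++i)
    std::cout << (i + 1) << "-grams: prune count <= " << thresholds[i] << '\n';
  // Prints thresholds 0 0 1 1 1, so singleton trigrams through 5-grams are dropped.
  return 0;
}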

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/output.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/output.cc b/ext/kenlm/lm/builder/output.cc
deleted file mode 100644
index 604fa22..0000000
--- a/ext/kenlm/lm/builder/output.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-#include "lm/builder/output.hh"
-
-#include "lm/common/model_buffer.hh"
-#include "lm/common/print.hh"
-#include "util/file_stream.hh"
-#include "util/stream/multi_stream.hh"
-
-#include <iostream>
-
-namespace lm { namespace builder {
-
-OutputHook::~OutputHook() {}
-
-Output::Output(StringPiece file_base, bool keep_buffer, bool output_q)
-  : buffer_(file_base, keep_buffer, output_q) {}
-
-void Output::SinkProbs(util::stream::Chains &chains) {
-  Apply(PROB_PARALLEL_HOOK, chains);
-  if (!buffer_.Keep() && !Have(PROB_SEQUENTIAL_HOOK)) {
-    chains >> util::stream::kRecycle;
-    chains.Wait(true);
-    return;
-  }
-  buffer_.Sink(chains, header_.counts_pruned);
-  chains >> util::stream::kRecycle;
-  chains.Wait(false);
-  if (Have(PROB_SEQUENTIAL_HOOK)) {
-    std::cerr << "=== 5/5 Writing ARPA model ===" << std::endl;
-    buffer_.Source(chains);
-    Apply(PROB_SEQUENTIAL_HOOK, chains);
-    chains >> util::stream::kRecycle;
-    chains.Wait(true);
-  }
-}
-
-void Output::Apply(HookType hook_type, util::stream::Chains &chains) {
-  for (boost::ptr_vector<OutputHook>::iterator entry = outputs_[hook_type].begin(); entry != outputs_[hook_type].end(); ++entry) {
-    entry->Sink(header_, VocabFile(), chains);
-  }
-}
-
-void PrintHook::Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) {
-  if (verbose_header_) {
-    util::FileStream out(file_.get(), 50);
-    out << "# Input file: " << info.input_file << '\n';
-    out << "# Token count: " << info.token_count << '\n';
-    out << "# Smoothing: Modified Kneser-Ney" << '\n';
-  }
-  chains >> PrintARPA(vocab_file, file_.get(), info.counts_pruned);
-}
-
-}} // namespaces

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/output.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/output.hh b/ext/kenlm/lm/builder/output.hh
deleted file mode 100644
index 69d6c6d..0000000
--- a/ext/kenlm/lm/builder/output.hh
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef LM_BUILDER_OUTPUT_H
-#define LM_BUILDER_OUTPUT_H
-
-#include "lm/builder/header_info.hh"
-#include "lm/common/model_buffer.hh"
-#include "util/file.hh"
-
-#include <boost/ptr_container/ptr_vector.hpp>
-#include <boost/utility.hpp>
-
-namespace util { namespace stream { class Chains; class ChainPositions; } }
-
-/* Outputs from lmplz: ARPA, sharded files, etc */
-namespace lm { namespace builder {
-
-// These are different types of hooks.  Values should be consecutive to enable a vector lookup.
-enum HookType {
-  // TODO: counts.
-  PROB_PARALLEL_HOOK, // Probability and backoff (or just q).  Output must process the orders in parallel or there will be a deadlock.
-  PROB_SEQUENTIAL_HOOK, // Probability and backoff (or just q).  Output can process orders any way it likes.  This requires writing the data to disk then reading.  Useful for ARPA files, which put unigrams first etc.
-  NUMBER_OF_HOOKS // Keep this last so we know how many values there are.
-};
-
-class OutputHook {
-  public:
-    explicit OutputHook(HookType hook_type) : type_(hook_type) {}
-
-    virtual ~OutputHook();
-
-    virtual void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains) = 0;
-
-    HookType Type() const { return type_; }
-
-  private:
-    HookType type_;
-};
-
-class Output : boost::noncopyable {
-  public:
-    Output(StringPiece file_base, bool keep_buffer, bool output_q);
-
-    // Takes ownership.
-    void Add(OutputHook *hook) {
-      outputs_[hook->Type()].push_back(hook);
-    }
-
-    bool Have(HookType hook_type) const {
-      return !outputs_[hook_type].empty();
-    }
-
-    int VocabFile() const { return buffer_.VocabFile(); }
-
-    void SetHeader(const HeaderInfo &header) { header_ = header; }
-    const HeaderInfo &GetHeader() const { return header_; }
-
-    // This is called by the pipeline.
-    void SinkProbs(util::stream::Chains &chains);
-
-    unsigned int Steps() const { return Have(PROB_SEQUENTIAL_HOOK); }
-
-  private:
-    void Apply(HookType hook_type, util::stream::Chains &chains);
-
-    ModelBuffer buffer_;
-
-    boost::ptr_vector<OutputHook> outputs_[NUMBER_OF_HOOKS];
-    HeaderInfo header_;
-};
-
-class PrintHook : public OutputHook {
-  public:
-    // Takes ownership
-    PrintHook(int write_fd, bool verbose_header)
-      : OutputHook(PROB_SEQUENTIAL_HOOK), file_(write_fd), verbose_header_(verbose_header) {}
-
-    void Sink(const HeaderInfo &info, int vocab_file, util::stream::Chains &chains);
-
-  private:
-    util::scoped_fd file_;
-    bool verbose_header_;
-};
-
-}} // namespaces
-
-#endif // LM_BUILDER_OUTPUT_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/payload.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/payload.hh b/ext/kenlm/lm/builder/payload.hh
deleted file mode 100644
index ba12725..0000000
--- a/ext/kenlm/lm/builder/payload.hh
+++ /dev/null
@@ -1,48 +0,0 @@
-#ifndef LM_BUILDER_PAYLOAD_H
-#define LM_BUILDER_PAYLOAD_H
-
-#include "lm/weights.hh"
-#include "lm/word_index.hh"
-#include <stdint.h>
-
-namespace lm { namespace builder {
-
-struct Uninterpolated {
-  float prob;  // Uninterpolated probability.
-  float gamma; // Interpolation weight for lower order.
-};
-
-union BuildingPayload {
-  uint64_t count;
-  Uninterpolated uninterp;
-  ProbBackoff complete;
-
-  /*mjd**********************************************************************/
-  bool IsMarked() const {
-    return count >> (sizeof(count) * 8 - 1);
-  }
-
-  void Mark() {
-    count |= (1ul << (sizeof(count) * 8 - 1));
-  }
-
-  void Unmark() {
-    count &= ~(1ul << (sizeof(count) * 8 - 1));
-  }
-
-  uint64_t UnmarkedCount() const {
-    return count & ~(1ul << (sizeof(count) * 8 - 1));
-  }
-
-  uint64_t CutoffCount() const {
-    return IsMarked() ? 0 : UnmarkedCount();
-  }
-  /*mjd**********************************************************************/
-};
-
-const WordIndex kBOS = 1;
-const WordIndex kEOS = 2;
-
-}} // namespaces
-
-#endif // LM_BUILDER_PAYLOAD_H
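The Mark/Unmark/CutoffCount helpers above pack a pruning flag into the top bit of the 64-bit count, so a pruned n-gram keeps its real count via UnmarkedCount while reporting 0 from CutoffCount. A standalone sketch of the same bit trick (names here are illustrative only):

#include <cassert>
#include <cstdint>

const uint64_t kMarkBit = 1ull << 63;  // high bit of the 64-bit count

int main() {
  uint64_t count = 7;
  count |= kMarkBit;                     // Mark(): flag the entry as pruned
  assert((count >> 63) == 1);            // IsMarked()
  assert((count & ~kMarkBit) == 7);      // UnmarkedCount(): the real count survives
  uint64_t cutoff = (count >> 63) ? 0 : (count & ~kMarkBit);
  assert(cutoff == 0);                   // CutoffCount(): pruned entries report 0
  count &= ~kMarkBit;                    // Unmark()
  assert(count == 7);
  return 0;
}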

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/pipeline.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/pipeline.cc b/ext/kenlm/lm/builder/pipeline.cc
deleted file mode 100644
index 64e30f7..0000000
--- a/ext/kenlm/lm/builder/pipeline.cc
+++ /dev/null
@@ -1,385 +0,0 @@
-#include "lm/builder/pipeline.hh"
-
-#include "lm/builder/adjust_counts.hh"
-#include "lm/builder/combine_counts.hh"
-#include "lm/builder/corpus_count.hh"
-#include "lm/builder/hash_gamma.hh"
-#include "lm/builder/initial_probabilities.hh"
-#include "lm/builder/interpolate.hh"
-#include "lm/builder/output.hh"
-#include "lm/common/compare.hh"
-#include "lm/common/renumber.hh"
-
-#include "lm/sizes.hh"
-#include "lm/vocab.hh"
-
-#include "util/exception.hh"
-#include "util/file.hh"
-#include "util/stream/io.hh"
-
-#include <algorithm>
-#include <iostream>
-#include <fstream>
-#include <vector>
-
-namespace lm { namespace builder {
-
-using util::stream::Sorts;
-
-namespace {
-
-void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint64_t> &counts_pruned, const std::vector<Discount> &discounts) {
-  std::cerr << "Statistics:\n";
-  for (size_t i = 0; i < counts.size(); ++i) {
-    std::cerr << (i + 1) << ' ' << counts_pruned[i];
-    if(counts[i] != counts_pruned[i])
-       std::cerr << "/" << counts[i];
-
-    for (size_t d = 1; d <= 3; ++d)
-      std::cerr << " D" << d << (d == 3 ? "+=" : "=") << discounts[i].amount[d];
-    std::cerr << '\n';
-  }
-}
-
-class Master {
-  public:
-    explicit Master(PipelineConfig &config, unsigned output_steps)
-      : config_(config), chains_(config.order), unigrams_(util::MakeTemp(config_.TempPrefix())), steps_(output_steps + 4) {
-      config_.minimum_block = std::max(NGram<BuildingPayload>::TotalSize(config_.order), config_.minimum_block);
-    }
-
-    const PipelineConfig &Config() const { return config_; }
-
-    util::stream::Chains &MutableChains() { return chains_; }
-
-    template <class T> Master &operator>>(const T &worker) {
-      chains_ >> worker;
-      return *this;
-    }
-
-    // This takes the (partially) sorted ngrams and sets up for adjusted counts.
-    void InitForAdjust(util::stream::Sort<SuffixOrder, CombineCounts> &ngrams, WordIndex types, std::size_t subtract_for_numbering) {
-      const std::size_t each_order_min = config_.minimum_block * config_.block_count;
-      // We know how many unigrams there are.  Don't allocate more than needed to them.
-      const std::size_t min_chains = (config_.order - 1) * each_order_min +
-        std::min(types * NGram<BuildingPayload>::TotalSize(1), each_order_min);
-      // Prevent overflow in subtracting.
-      const std::size_t total = std::max<std::size_t>(config_.TotalMemory(), min_chains + subtract_for_numbering + config_.minimum_block);
-      // Do merge sort with calculated laziness.
-      const std::size_t merge_using = ngrams.Merge(std::min(total - min_chains - subtract_for_numbering, ngrams.DefaultLazy()));
-
-      std::vector<uint64_t> count_bounds(1, types);
-      CreateChains(total - merge_using - subtract_for_numbering, count_bounds);
-      ngrams.Output(chains_.back(), merge_using);
-    }
-
-    // For initial probabilities, but this is generic.
-    void SortAndReadTwice(const std::vector<uint64_t> &counts, Sorts<ContextOrder> &sorts, util::stream::Chains &second, util::stream::ChainConfig second_config) {
-      bool unigrams_are_sorted = !config_.renumber_vocabulary;
-      // Do merge first before allocating chain memory.
-      for (std::size_t i = 0; i < config_.order - unigrams_are_sorted; ++i) {
-        sorts[i].Merge(0);
-      }
-      // There's no lazy merge, so just divide memory amongst the chains.
-      CreateChains(config_.TotalMemory(), counts);
-      chains_.back().ActivateProgress();
-      if (unigrams_are_sorted) {
-        chains_[0] >> unigrams_.Source();
-        second_config.entry_size = NGram<BuildingPayload>::TotalSize(1);
-        second.push_back(second_config);
-        second.back() >> unigrams_.Source();
-      }
-      for (std::size_t i = unigrams_are_sorted; i < config_.order; ++i) {
-        util::scoped_fd fd(sorts[i - unigrams_are_sorted].StealCompleted());
-        chains_[i].SetProgressTarget(util::SizeOrThrow(fd.get()));
-        chains_[i] >> util::stream::PRead(util::DupOrThrow(fd.get()), true);
-        second_config.entry_size = NGram<BuildingPayload>::TotalSize(i + 1);
-        second.push_back(second_config);
-        second.back() >> util::stream::PRead(fd.release(), true);
-      }
-    }
-
-    // There is no sort after this, so go for broke on lazy merging.
-    template <class Compare> void MaximumLazyInput(const std::vector<uint64_t> &counts, Sorts<Compare> &sorts) {
-      // Determine the minimum we can use for all the chains.
-      std::size_t min_chains = 0;
-      for (std::size_t i = 0; i < config_.order; ++i) {
-        min_chains += std::min(counts[i] * NGram<BuildingPayload>::TotalSize(i + 1), static_cast<uint64_t>(config_.minimum_block));
-      }
-      std::size_t for_merge = min_chains > config_.TotalMemory() ? 0 : (config_.TotalMemory() - min_chains);
-      std::vector<std::size_t> laziness;
-      // Prioritize longer n-grams.
-      for (util::stream::Sort<SuffixOrder> *i = sorts.end() - 1; i >= sorts.begin(); --i) {
-        laziness.push_back(i->Merge(for_merge));
-        assert(for_merge >= laziness.back());
-        for_merge -= laziness.back();
-      }
-      std::reverse(laziness.begin(), laziness.end());
-
-      CreateChains(for_merge + min_chains, counts);
-      chains_.back().ActivateProgress();
-      chains_[0] >> unigrams_.Source();
-      for (std::size_t i = 1; i < config_.order; ++i) {
-        sorts[i - 1].Output(chains_[i], laziness[i - 1]);
-      }
-    }
-
-    template <class Compare> void SetupSorts(Sorts<Compare> &sorts, bool exclude_unigrams) {
-      sorts.Init(config_.order - exclude_unigrams);
-      // Unigrams don't get sorted because their order is always the same.
-      if (exclude_unigrams) chains_[0] >> unigrams_.Sink();
-      for (std::size_t i = exclude_unigrams; i < config_.order; ++i) {
-        sorts.push_back(chains_[i], config_.sort, Compare(i + 1));
-      }
-      chains_.Wait(true);
-    }
-
-    unsigned int Steps() const { return steps_; }
-
-  private:
-    // Create chains, allocating memory to them.  Totally heuristic.  Count
-    // bounds are upper bounds on the counts or not present.
-    void CreateChains(std::size_t remaining_mem, const std::vector<uint64_t> &count_bounds) {
-      std::vector<std::size_t> assignments;
-      assignments.reserve(config_.order);
-      // Start by assigning maximum memory usage (to be refined later).
-      for (std::size_t i = 0; i < count_bounds.size(); ++i) {
-        assignments.push_back(static_cast<std::size_t>(std::min(
-            static_cast<uint64_t>(remaining_mem),
-            count_bounds[i] * static_cast<uint64_t>(NGram<BuildingPayload>::TotalSize(i + 1)))));
-      }
-      assignments.resize(config_.order, remaining_mem);
-
-      // Now we know how much memory everybody wants.  How much will they get?
-      // Proportional to this.
-      std::vector<float> portions;
-      // Indices of orders that have yet to be assigned.
-      std::vector<std::size_t> unassigned;
-      for (std::size_t i = 0; i < config_.order; ++i) {
-        portions.push_back(static_cast<float>((i+1) * NGram<BuildingPayload>::TotalSize(i+1)));
-        unassigned.push_back(i);
-      }
-      /*If somebody doesn't eat their full dinner, give it to the rest of the
-       * family.  Then somebody else might not eat their full dinner etc.  Ends
-       * when everybody unassigned is hungry.
-       */
-      float sum;
-      bool found_more;
-      std::vector<std::size_t> block_count(config_.order);
-      do {
-        sum = 0.0;
-        for (std::size_t i = 0; i < unassigned.size(); ++i) {
-          sum += portions[unassigned[i]];
-        }
-        found_more = false;
-        // If the proportional assignment is more than needed, give it just what it needs.
-        for (std::vector<std::size_t>::iterator i = unassigned.begin(); i != unassigned.end();) {
-          if (assignments[*i] <= remaining_mem * (portions[*i] / sum)) {
-            remaining_mem -= assignments[*i];
-            block_count[*i] = 1;
-            i = unassigned.erase(i);
-            found_more = true;
-          } else {
-            ++i;
-          }
-        }
-      } while (found_more);
-      for (std::vector<std::size_t>::iterator i = unassigned.begin(); i != unassigned.end(); ++i) {
-        assignments[*i] = remaining_mem * (portions[*i] / sum);
-        block_count[*i] = config_.block_count;
-      }
-      chains_.clear();
-      std::cerr << "Chain sizes:";
-      for (std::size_t i = 0; i < config_.order; ++i) {
-        // Always have enough for at least one record.
-        // This was crashing if e.g. there was no 5-gram.
-        assignments[i] = std::max(assignments[i], block_count[i] * NGram<BuildingPayload>::TotalSize(i + 1));
-        std::cerr << ' ' << (i+1) << ":" << assignments[i];
-        chains_.push_back(util::stream::ChainConfig(NGram<BuildingPayload>::TotalSize(i + 1), block_count[i], assignments[i]));
-      }
-      std::cerr << std::endl;
-    }
-
-    PipelineConfig &config_;
-
-    util::stream::Chains chains_;
-
-    util::stream::FileBuffer unigrams_;
-
-    const unsigned int steps_;
-};
-
-util::stream::Sort<SuffixOrder, CombineCounts> *CountText(int text_file /* input */, int vocab_file /* output */, Master &master, uint64_t &token_count, WordIndex &type_count, std::string &text_file_name, std::vector<bool> &prune_words) {
-  const PipelineConfig &config = master.Config();
-  std::cerr << "=== 1/" << master.Steps() << " Counting and sorting n-grams ===" << std::endl;
-
-  const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate);
-  UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory());
-  std::size_t memory_for_chain =
-    // This much memory to work with after vocab hash table.
-    static_cast<float>(config.TotalMemory() - vocab_usage) /
-    // Solve for block size including the dedupe multiplier for one block.
-    (static_cast<float>(config.block_count) + CorpusCount::DedupeMultiplier(config.order)) *
-    // Chain likes memory expressed in terms of total memory.
-    static_cast<float>(config.block_count);
-  util::stream::Chain chain(util::stream::ChainConfig(NGram<BuildingPayload>::TotalSize(config.order), config.block_count, memory_for_chain));
-
-  type_count = config.vocab_estimate;
-  util::FilePiece text(text_file, NULL, &std::cerr);
-  text_file_name = text.FileName();
-  CorpusCount counter(text, vocab_file, token_count, type_count, prune_words, config.prune_vocab_file, chain.BlockSize() / chain.EntrySize(), config.disallowed_symbol_action);
-  chain >> boost::ref(counter);
-
-  util::scoped_ptr<util::stream::Sort<SuffixOrder, CombineCounts> > sorter(new util::stream::Sort<SuffixOrder, CombineCounts>(chain, config.sort, SuffixOrder(config.order), CombineCounts()));
-  chain.Wait(true);
-  return sorter.release();
-}
-
-void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector<uint64_t> &counts_pruned, const std::vector<Discount> &discounts, Master &master, Sorts<SuffixOrder> &primary, util::FixedArray<util::stream::FileBuffer> &gammas, const std::vector<uint64_t> &prune_thresholds, bool prune_vocab, const SpecialVocab &specials) {
-  const PipelineConfig &config = master.Config();
-  util::stream::Chains second(config.order);
-
-  {
-    Sorts<ContextOrder> sorts;
-    master.SetupSorts(sorts, !config.renumber_vocabulary);
-    PrintStatistics(counts, counts_pruned, discounts);
-    lm::ngram::ShowSizes(counts_pruned);
-    std::cerr << "=== 3/" << master.Steps() << " Calculating and sorting initial probabilities ===" << std::endl;
-    master.SortAndReadTwice(counts_pruned, sorts, second, config.initial_probs.adder_in);
-  }
-
-  util::stream::Chains gamma_chains(config.order);
-  InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab, specials);
-  // Don't care about gamma for 0.
-  gamma_chains[0] >> util::stream::kRecycle;
-  gammas.Init(config.order - 1);
-  for (std::size_t i = 1; i < config.order; ++i) {
-    gammas.push_back(util::MakeTemp(config.TempPrefix()));
-    gamma_chains[i] >> gammas[i - 1].Sink();
-  }
-  // Has to be done here due to gamma_chains scope.
-  master.SetupSorts(primary, true);
-}
-
-void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &master, Sorts<SuffixOrder> &primary, util::FixedArray<util::stream::FileBuffer> &gammas, Output &output, const SpecialVocab &specials) {
-  std::cerr << "=== 4/" << master.Steps() << " Calculating and writing order-interpolated probabilities ===" << std::endl;
-  const PipelineConfig &config = master.Config();
-  master.MaximumLazyInput(counts, primary);
-
-  util::stream::Chains gamma_chains(config.order - 1);
-  for (std::size_t i = 0; i < config.order - 1; ++i) {
-    util::stream::ChainConfig read_backoffs(config.read_backoffs);
-
-    if(config.prune_vocab || config.prune_thresholds[i + 1] > 0)
-        read_backoffs.entry_size = sizeof(HashGamma);
-    else
-        read_backoffs.entry_size = sizeof(float);
-
-    gamma_chains.push_back(read_backoffs);
-    gamma_chains.back() >> gammas[i].Source(true);
-  }
-  master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), util::stream::ChainPositions(gamma_chains), config.prune_thresholds, config.prune_vocab, config.output_q, specials);
-  gamma_chains >> util::stream::kRecycle;
-  output.SinkProbs(master.MutableChains());
-}
-
-class VocabNumbering {
-  public:
-    VocabNumbering(int final_vocab, StringPiece temp_prefix, bool renumber)
-      : final_vocab_(final_vocab),
-        renumber_(renumber),
-        specials_(kBOS, kEOS) {
-      if (renumber) {
-        temporary_.reset(util::MakeTemp(temp_prefix));
-      }
-    }
-
-    int WriteOnTheFly() const { return renumber_ ? temporary_.get() : final_vocab_; }
-
-    // Compute the vocabulary mapping and return the memory used.
-    std::size_t ComputeMapping(WordIndex type_count) {
-      if (!renumber_) return 0;
-      ngram::SortedVocabulary::ComputeRenumbering(type_count, temporary_.get(), final_vocab_, vocab_mapping_);
-      temporary_.reset();
-      return sizeof(WordIndex) * vocab_mapping_.size();
-    }
-
-    void ApplyRenumber(util::stream::Chains &chains) {
-      if (!renumber_) return;
-      for (std::size_t i = 0; i < chains.size(); ++i) {
-        chains[i] >> Renumber(&*vocab_mapping_.begin(), i + 1);
-      }
-      specials_ = SpecialVocab(vocab_mapping_[specials_.BOS()], vocab_mapping_[specials_.EOS()]);
-    }
-
-    const SpecialVocab &Specials() const { return specials_; }
-
-  private:
-    int final_vocab_;
-    // Out of order vocab file created on the fly.
-    util::scoped_fd temporary_;
-
-    bool renumber_;
-
-    std::vector<WordIndex> vocab_mapping_;
-
-    SpecialVocab specials_;
-};
-
-} // namespace
-
-void Pipeline(PipelineConfig &config, int text_file, Output &output) {
-  // Some fail-fast sanity checks.
-  if (config.sort.buffer_size * 4 > config.TotalMemory()) {
-    config.sort.buffer_size = config.TotalMemory() / 4;
-    std::cerr << "Warning: changing sort block size to " << config.sort.buffer_size << " bytes due to low total memory." << std::endl;
-  }
-  if (config.minimum_block < NGram<BuildingPayload>::TotalSize(config.order)) {
-    config.minimum_block = NGram<BuildingPayload>::TotalSize(config.order);
-    std::cerr << "Warning: raising minimum block to " << config.minimum_block << " to fit an ngram in every block." << std::endl;
-  }
-  UTIL_THROW_IF(config.sort.buffer_size < config.minimum_block, util::Exception, "Sort block size " << config.sort.buffer_size << " is below the minimum block size " << config.minimum_block << ".");
-  UTIL_THROW_IF(config.TotalMemory() < config.minimum_block * config.order * config.block_count, util::Exception,
-      "Not enough memory to fit " << (config.order * config.block_count) << " blocks with minimum size " << config.minimum_block << ".  Increase memory to " << (config.minimum_block * config.order * config.block_count) << " bytes or decrease the minimum block size.");
-
-  Master master(config, output.Steps());
-  // master's destructor will wait for chains.  But they might be deadlocked if
-  // this thread dies because e.g. it ran out of memory.
-  try {
-    VocabNumbering numbering(output.VocabFile(), config.TempPrefix(), config.renumber_vocabulary);
-    uint64_t token_count;
-    WordIndex type_count;
-    std::string text_file_name;
-    std::vector<bool> prune_words;
-    util::scoped_ptr<util::stream::Sort<SuffixOrder, CombineCounts> > sorted_counts(
-        CountText(text_file, numbering.WriteOnTheFly(), master, token_count, type_count, text_file_name, prune_words));
-    std::cerr << "Unigram tokens " << token_count << " types " << type_count << std::endl;
-
-    // Create vocab mapping, which uses temporary memory, while nothing else is happening.
-    std::size_t subtract_for_numbering = numbering.ComputeMapping(type_count);
-
-    std::cerr << "=== 2/" << master.Steps() << " Calculating and sorting adjusted counts ===" << std::endl;
-    master.InitForAdjust(*sorted_counts, type_count, subtract_for_numbering);
-    sorted_counts.reset();
-
-    std::vector<uint64_t> counts;
-    std::vector<uint64_t> counts_pruned;
-    std::vector<Discount> discounts;
-    master >> AdjustCounts(config.prune_thresholds, counts, counts_pruned, prune_words, config.discount, discounts);
-    numbering.ApplyRenumber(master.MutableChains());
-
-    {
-      util::FixedArray<util::stream::FileBuffer> gammas;
-      Sorts<SuffixOrder> primary;
-      InitialProbabilities(counts, counts_pruned, discounts, master, primary, gammas, config.prune_thresholds, config.prune_vocab, numbering.Specials());
-      output.SetHeader(HeaderInfo(text_file_name, token_count, counts_pruned));
-      // Also does output.
-      InterpolateProbabilities(counts_pruned, master, primary, gammas, output, numbering.Specials());
-    }
-  } catch (const util::Exception &e) {
-    std::cerr << e.what() << std::endl;
-    abort();
-  }
-}
-
-}} // namespaces
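
As a reading aid for the deleted pipeline.cc above: the fail-fast checks at the top of Pipeline() are plain arithmetic on the configured memory budget. A standalone sketch with purely illustrative numbers (not part of the original file):

#include <cstddef>
#include <iostream>

int main() {
  // Illustrative values only, mirroring the sanity checks in Pipeline().
  std::size_t total_memory = 1ULL << 30;  // overall budget (config.TotalMemory())
  std::size_t sort_buffer = 512ULL << 20; // config.sort.buffer_size
  std::size_t order = 5, block_count = 2, minimum_block = 8192;

  // The sort buffer is capped at a quarter of total memory.
  if (sort_buffer * 4 > total_memory) sort_buffer = total_memory / 4;
  // Every order needs block_count blocks of at least minimum_block bytes,
  // and the sort buffer itself must hold at least one minimum block.
  bool ok = sort_buffer >= minimum_block
         && total_memory >= minimum_block * order * block_count;
  std::cout << (ok ? "fits" : "not enough memory") << '\n';
  return 0;
}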

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/pipeline.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/builder/pipeline.hh b/ext/kenlm/lm/builder/pipeline.hh
deleted file mode 100644
index 66f1fd9..0000000
--- a/ext/kenlm/lm/builder/pipeline.hh
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef LM_BUILDER_PIPELINE_H
-#define LM_BUILDER_PIPELINE_H
-
-#include "lm/builder/adjust_counts.hh"
-#include "lm/builder/initial_probabilities.hh"
-#include "lm/builder/header_info.hh"
-#include "lm/lm_exception.hh"
-#include "lm/word_index.hh"
-#include "util/stream/config.hh"
-#include "util/file_piece.hh"
-
-#include <string>
-#include <cstddef>
-
-namespace lm { namespace builder {
-
-class Output;
-
-struct PipelineConfig {
-  std::size_t order;
-  util::stream::SortConfig sort;
-  InitialProbabilitiesConfig initial_probs;
-  util::stream::ChainConfig read_backoffs;
-
-  // Estimated vocabulary size.  Used in CorpusCount to size memory and the
-  // initial probing hash table.
-  lm::WordIndex vocab_estimate;
-
-  // Minimum block size to tolerate.
-  std::size_t minimum_block;
-
-  // Number of blocks to use.  This will be overridden to 1 if everything fits.
-  std::size_t block_count;
-
-  // n-gram count thresholds for pruning.  A threshold of 0 means no pruning
-  // for the corresponding n-gram order.
-  std::vector<uint64_t> prune_thresholds; //mjd
-  bool prune_vocab;
-  std::string prune_vocab_file;
-
-  /* Renumber the vocabulary the way the trie likes it? */
-  bool renumber_vocabulary;
-
-  // What to do with discount failures.
-  DiscountConfig discount;
-
-  // Compute collapsed q values instead of probability and backoff
-  bool output_q;
-
-  /* Computing the perplexity of LMs with different vocabularies is hard.  For
-   * example, the lowest perplexity is attained by a unigram model that
-   * predicts p(<unk>) = 1 and has no other vocabulary.  Also, linearly
-   * interpolated models will sum to more than 1 because <unk> is duplicated
-   * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
-   * 1 but comes with its own problems).  This option will make the vocabulary
-   * a particular size by replicating <unk> multiple times for purposes of
-   * computing vocabulary size.  It has no effect if the actual vocabulary is
-   * larger.  This parameter serves the same purpose as IRSTLM's "dub".
-   */
-  uint64_t vocab_size_for_unk;
-
-  /* What to do the first time <s>, </s>, or <unk> appears in the input.  If
-   * this is anything but THROW_UP, then the symbol will always be treated as
-   * whitespace.
-   */
-  WarningAction disallowed_symbol_action;
-
-  const std::string &TempPrefix() const { return sort.temp_prefix; }
-  std::size_t TotalMemory() const { return sort.total_memory; }
-};
-
-// Takes ownership of text_file and out_arpa.
-void Pipeline(PipelineConfig &config, int text_file, Output &output);
-
-}} // namespaces
-#endif // LM_BUILDER_PIPELINE_H
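
A hedged sketch of filling in the fields documented above; the values are illustrative only (a real caller such as lmplz derives them from command-line options), and the remaining fields (sort defaults, discount handling, initial_probs, read_backoffs) are omitted for brevity:

#include "lm/builder/pipeline.hh"

lm::builder::PipelineConfig MakeConfig() {
  lm::builder::PipelineConfig config;
  config.order = 5;
  config.vocab_estimate = 1000000;          // sizes CorpusCount's probing hash table
  config.minimum_block = 8192;
  config.block_count = 2;
  config.sort.temp_prefix = "/tmp/lm";      // hypothetical scratch location
  config.sort.total_memory = 1ULL << 30;    // 1 GiB budget for the whole pipeline
  config.sort.buffer_size = 64ULL << 20;
  config.prune_thresholds.assign(config.order, 0); // 0 = keep everything at that order
  config.prune_vocab = false;
  config.renumber_vocabulary = true;        // trie-friendly word IDs
  config.output_q = false;                  // probability/backoff, not collapsed q
  config.vocab_size_for_unk = 0;            // no <unk> replication ("dub")
  config.disallowed_symbol_action = lm::COMPLAIN; // warn if <s>, </s>, <unk> appear in text
  return config;
}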

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/CMakeLists.txt b/ext/kenlm/lm/common/CMakeLists.txt
deleted file mode 100644
index 942e24b..0000000
--- a/ext/kenlm/lm/common/CMakeLists.txt
+++ /dev/null
@@ -1,40 +0,0 @@
-cmake_minimum_required(VERSION 2.8.8)
-#
-# The KenLM cmake files make use of add_library(... OBJECT ...)
-# 
-# This syntax allows grouping of source files when compiling
-# (effectively creating "fake" libraries based on source subdirs).
-# 
-# This syntax was only added in cmake version 2.8.8
-#
-# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
-
-
-# This CMake file was created by Lane Schwartz <do...@gmail.com>
-
-# Explicitly list the source files for this subdirectory
-#
-# If you add any source files to this subdirectory
-#    that should be included in the kenlm library,
-#        (this excludes any unit test files)
-#    you should add them to the following list:
-#
-# In order to set correct paths to these files
-#    in case this variable is referenced by CMake files in the parent directory,
-#    we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
-#
-set(KENLM_COMMON_SOURCE 
-		${CMAKE_CURRENT_SOURCE_DIR}/model_buffer.cc
-		${CMAKE_CURRENT_SOURCE_DIR}/print.cc
-		${CMAKE_CURRENT_SOURCE_DIR}/renumber.cc
-		${CMAKE_CURRENT_SOURCE_DIR}/size_option.cc
-	)
-
-
-# Group these objects together for later use. 
-#
-# Given add_library(foo OBJECT ${my_foo_sources}),
-# refer to these objects as $<TARGET_OBJECTS:foo>
-#
-add_library(kenlm_common OBJECT ${KENLM_COMMON_SOURCE})
-

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/Jamfile
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/Jamfile b/ext/kenlm/lm/common/Jamfile
deleted file mode 100644
index c9bdfd0..0000000
--- a/ext/kenlm/lm/common/Jamfile
+++ /dev/null
@@ -1,2 +0,0 @@
-fakelib common : [ glob *.cc : *test.cc *main.cc ]
-  ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm /top//boost_program_options ;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/compare.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/compare.hh b/ext/kenlm/lm/common/compare.hh
deleted file mode 100644
index 1c7cd24..0000000
--- a/ext/kenlm/lm/common/compare.hh
+++ /dev/null
@@ -1,174 +0,0 @@
-#ifndef LM_COMMON_COMPARE_H
-#define LM_COMMON_COMPARE_H
-
-#include "lm/word_index.hh"
-
-#include <functional>
-#include <string>
-
-namespace lm {
-
-/**
- * Abstract parent class for defining custom n-gram comparators.
- */
-template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> {
-  public:
-
-    /**
-     * Constructs a comparator capable of comparing two n-grams.
-     *
-     * @param order Number of words in each n-gram
-     */
-    explicit Comparator(std::size_t order) : order_(order) {}
-
-    /**
-     * Applies the comparator using the Compare method that must be defined in any class that inherits from this class.
-     *
-     * @param lhs A pointer to the n-gram on the left-hand side of the comparison
-     * @param rhs A pointer to the n-gram on the right-hand side of the comparison
-     *
-     * @see ContextOrder::Compare
-     * @see PrefixOrder::Compare
-     * @see SuffixOrder::Compare
-     */
-    inline bool operator()(const void *lhs, const void *rhs) const {
-      return static_cast<const Child*>(this)->Compare(static_cast<const WordIndex*>(lhs), static_cast<const WordIndex*>(rhs));
-    }
-
-    /** Gets the n-gram order defined for this comparator. */
-    std::size_t Order() const { return order_; }
-
-  protected:
-    std::size_t order_;
-};
-
-/**
- * N-gram comparator that compares n-grams according to their reverse (suffix) order.
- *
- * This comparator compares n-grams lexicographically, one word at a time,
- * beginning with the last word of each n-gram and ending with the first word of each n-gram.
- *
- * Some examples of n-gram comparisons as defined by this comparator:
- * - a b c == a b c
- * - a b c < a b d
- * - a b c > a d b
- * - a b c > a b b
- * - a b c > x a c
- * - a b c < x y z
- */
-class SuffixOrder : public Comparator<SuffixOrder> {
-  public:
-
-    /**
-     * Constructs a comparator capable of comparing two n-grams.
-     *
-     * @param order Number of words in each n-gram
-     */
-    explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {}
-
-    /**
-     * Compares two n-grams lexicographically, one word at a time,
-     * beginning with the last word of each n-gram and ending with the first word of each n-gram.
-     *
-     * @param lhs A pointer to the n-gram on the left-hand side of the comparison
-     * @param rhs A pointer to the n-gram on the right-hand side of the comparison
-     */
-    inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const {
-      for (std::size_t i = order_ - 1; i != 0; --i) {
-        if (lhs[i] != rhs[i])
-          return lhs[i] < rhs[i];
-      }
-      return lhs[0] < rhs[0];
-    }
-
-    static const unsigned kMatchOffset = 1;
-};
-
-
-/**
-  * N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context.
-  *
-  * This comparator compares n-grams lexicographically, one word at a time,
-  * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
-  * finally, this comparator compares the last word of each n-gram.
-  *
-  * Some examples of n-gram comparisons as defined by this comparator:
-  * - a b c == a b c
-  * - a b c < a b d
-  * - a b c < a d b
-  * - a b c > a b b
-  * - a b c > x a c
-  * - a b c < x y z
-  */
-class ContextOrder : public Comparator<ContextOrder> {
-  public:
-
-    /**
-     * Constructs a comparator capable of comparing two n-grams.
-     *
-     * @param order Number of words in each n-gram
-     */
-    explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {}
-
-    /**
-     * Compares two n-grams lexicographically, one word at a time,
-     * beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
-     * finally, this comparator compares the last word of each n-gram.
-     *
-     * @param lhs A pointer to the n-gram on the left-hand side of the comparison
-     * @param rhs A pointer to the n-gram on the right-hand side of the comparison
-     */
-    inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const {
-      for (int i = order_ - 2; i >= 0; --i) {
-        if (lhs[i] != rhs[i])
-          return lhs[i] < rhs[i];
-      }
-      return lhs[order_ - 1] < rhs[order_ - 1];
-    }
-};
-
-/**
- * N-gram comparator that compares n-grams according to their natural (prefix) order.
- *
- * This comparator compares n-grams lexicographically, one word at a time,
- * beginning with the first word of each n-gram and ending with the last word of each n-gram.
- *
- * Some examples of n-gram comparisons as defined by this comparator:
- * - a b c == a b c
- * - a b c < a b d
- * - a b c < a d b
- * - a b c > a b b
- * - a b c < x a c
- * - a b c < x y z
- */
-class PrefixOrder : public Comparator<PrefixOrder> {
-  public:
-
-    /**
-     * Constructs a comparator capable of comparing two n-grams.
-     *
-     * @param order Number of words in each n-gram
-     */
-    explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {}
-
-    /**
-     * Compares two n-grams lexicographically, one word at a time,
-     * beginning with the first word of each n-gram and ending with the last word of each n-gram.
-     *
-     * @param lhs A pointer to the n-gram on the left-hand side of the comparison
-     * @param rhs A pointer to the n-gram on the right-hand side of the comparison
-     */
-    inline bool Compare(const WordIndex *lhs, const WordIndex *rhs) const {
-      for (std::size_t i = 0; i < order_; ++i) {
-        if (lhs[i] != rhs[i])
-          return lhs[i] < rhs[i];
-      }
-      return false;
-    }
-
-    static const unsigned kMatchOffset = 0;
-};
-
-} // namespace lm
-
-#endif // LM_COMMON_COMPARE_H
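
A small standalone check that replays two of the doc-comment examples above (the word IDs are arbitrary; only the header shown above is assumed to be on the include path):

#include "lm/common/compare.hh"

#include <iostream>

int main() {
  // Map a=0, b=1, c=2, d=3 so the "a b c" examples above can be replayed.
  lm::WordIndex abc[] = {0, 1, 2};
  lm::WordIndex adb[] = {0, 3, 1};
  lm::SuffixOrder suffix(3);  // last word first: "a b c" > "a d b"
  lm::PrefixOrder prefix(3);  // first word first: "a b c" < "a d b"
  std::cout << suffix(abc, adb) << ' ' << prefix(abc, adb) << '\n';  // prints "0 1"
  return 0;
}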

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/joint_order.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/joint_order.hh b/ext/kenlm/lm/common/joint_order.hh
deleted file mode 100644
index 6113bb8..0000000
--- a/ext/kenlm/lm/common/joint_order.hh
+++ /dev/null
@@ -1,71 +0,0 @@
-#ifndef LM_COMMON_JOINT_ORDER_H
-#define LM_COMMON_JOINT_ORDER_H
-
-#include "lm/common/ngram_stream.hh"
-#include "lm/lm_exception.hh"
-
-#ifdef DEBUG
-#include "util/fixed_array.hh"
-#include <iostream>
-#endif
-
-#include <cstring>
-
-namespace lm {
-
-template <class Callback, class Compare> void JointOrder(const util::stream::ChainPositions &positions, Callback &callback) {
-  // Allow matching to reference streams[-1].
-  util::FixedArray<ProxyStream<NGramHeader> > streams_with_dummy(positions.size() + 1);
-  // A bogus stream for [-1].
-  streams_with_dummy.push_back();
-  for (std::size_t i = 0; i < positions.size(); ++i) {
-    streams_with_dummy.push_back(positions[i], NGramHeader(NULL, i + 1));
-  }
-  ProxyStream<NGramHeader> *streams = streams_with_dummy.begin() + 1;
-
-  std::size_t order;
-  for (order = 0; order < positions.size() && streams[order]; ++order) {}
-  assert(order); // should always have <unk>.
-
-  // Debugging only: call comparison function to sanity check order.
-#ifdef DEBUG
-  util::FixedArray<Compare> less_compare(order);
-  for (unsigned i = 0; i < order; ++i)
-    less_compare.push_back(i + 1);
-#endif // DEBUG
-
-  std::size_t current = 0;
-  while (true) {
-    // Does the context match the lower one?
-    if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
-      callback.Enter(current, streams[current].Get());
-      // Transition to looking for extensions.
-      if (++current < order) continue;
-    }
-#ifdef DEBUG
-    // less_compare[current - 1] compares n-grams of order current.
-    // The lower-order stream (which skips fewer current-grams) should always be <= the higher-order stream (which can skip current-grams).
-    else if (!less_compare[current - 1](streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset)) {
-      std::cerr << "Stream out of order detected" << std::endl;
-      abort();
-    }
-#endif // DEBUG
-    // No extension left.
-    while(true) {
-      assert(current > 0);
-      --current;
-      callback.Exit(current, streams[current].Get());
-
-      if (++streams[current]) break;
-
-      UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");
-
-      order = current;
-      if (!order) return;
-    }
-  }
-}
-
-} // namespaces
-
-#endif // LM_COMMON_JOINT_ORDER_H
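
JointOrder() drives a caller-supplied callback through matching n-grams; below is a minimal sketch of the Enter/Exit interface it expects, assuming a callback that merely reports depth (the class name and behaviour are hypothetical):

#include <cstddef>
#include <iostream>

// Hypothetical callback: Enter() fires when an n-gram extends the matched
// (n-1)-gram context below it, Exit() fires when that extension is exhausted.
class PrintDepth {
  public:
    void Enter(std::size_t order_minus_1, const void * /*ngram*/) {
      std::cout << "enter order " << (order_minus_1 + 1) << '\n';
    }
    void Exit(std::size_t order_minus_1, const void * /*ngram*/) {
      std::cout << "exit order " << (order_minus_1 + 1) << '\n';
    }
};

// Usage sketch: lm::JointOrder<PrintDepth, lm::SuffixOrder>(positions, callback);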

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/model_buffer.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/model_buffer.cc b/ext/kenlm/lm/common/model_buffer.cc
deleted file mode 100644
index ae9b08c..0000000
--- a/ext/kenlm/lm/common/model_buffer.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-#include "lm/common/model_buffer.hh"
-#include "util/exception.hh"
-#include "util/file_stream.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-#include "util/stream/io.hh"
-#include "util/stream/multi_stream.hh"
-
-#include <boost/lexical_cast.hpp>
-
-namespace lm {
-
-namespace {
-const char kMetadataHeader[] = "KenLM intermediate binary file";
-} // namespace
-
-ModelBuffer::ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q)
-  : file_base_(file_base.data(), file_base.size()), keep_buffer_(keep_buffer), output_q_(output_q),
-    vocab_file_(keep_buffer ? util::CreateOrThrow((file_base_ + ".vocab").c_str()) : util::MakeTemp(file_base_)) {}
-  
-ModelBuffer::ModelBuffer(StringPiece file_base)
-  : file_base_(file_base.data(), file_base.size()), keep_buffer_(false) {
-  const std::string full_name = file_base_ + ".kenlm_intermediate";
-  util::FilePiece in(full_name.c_str());
-  StringPiece token = in.ReadLine();
-  UTIL_THROW_IF2(token != kMetadataHeader, "File " << full_name << " begins with \"" << token << "\" not " << kMetadataHeader);
-
-  token = in.ReadDelimited();
-  UTIL_THROW_IF2(token != "Counts", "Expected Counts, got \"" << token << "\" in " << full_name);
-  char got;
-  while ((got = in.get()) == ' ') {
-    counts_.push_back(in.ReadULong());
-  }
-  UTIL_THROW_IF2(got != '\n', "Expected newline at end of counts.");
-
-  token = in.ReadDelimited();
-  UTIL_THROW_IF2(token != "Payload", "Expected Payload, got \"" << token << "\" in " << full_name);
-  token = in.ReadDelimited();
-  if (token == "q") {
-    output_q_ = true;
-  } else if (token == "pb") {
-    output_q_ = false;
-  } else {
-    UTIL_THROW(util::Exception, "Unknown payload " << token);
-  }
-
-  vocab_file_.reset(util::OpenReadOrThrow((file_base_ + ".vocab").c_str()));
-
-  files_.Init(counts_.size());
-  for (unsigned long i = 0; i < counts_.size(); ++i) {
-    files_.push_back(util::OpenReadOrThrow((file_base_ + '.' + boost::lexical_cast<std::string>(i + 1)).c_str()));
-  }
-}
-
-void ModelBuffer::Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts) {
-  counts_ = counts;
-  // Open files.
-  files_.Init(chains.size());
-  for (std::size_t i = 0; i < chains.size(); ++i) {
-    if (keep_buffer_) {
-      files_.push_back(util::CreateOrThrow(
-            (file_base_ + '.' + boost::lexical_cast<std::string>(i + 1)).c_str()
-            ));
-    } else {
-      files_.push_back(util::MakeTemp(file_base_));
-    }
-    chains[i] >> util::stream::Write(files_.back().get());
-  }
-  if (keep_buffer_) {
-    util::scoped_fd metadata(util::CreateOrThrow((file_base_ + ".kenlm_intermediate").c_str()));
-    util::FileStream meta(metadata.get(), 200);
-    meta << kMetadataHeader << "\nCounts";
-    for (std::vector<uint64_t>::const_iterator i = counts_.begin(); i != counts_.end(); ++i) {
-      meta << ' ' << *i;
-    }
-    meta << "\nPayload " << (output_q_ ? "q" : "pb") << '\n';
-  }
-}
-
-void ModelBuffer::Source(util::stream::Chains &chains) {
-  assert(chains.size() <= files_.size());
-  for (unsigned int i = 0; i < chains.size(); ++i) {
-    chains[i] >> util::stream::PRead(files_[i].get());
-  }
-}
-
-void ModelBuffer::Source(std::size_t order_minus_1, util::stream::Chain &chain) {
-  chain >> util::stream::PRead(files_[order_minus_1].get());
-}
-
-} // namespace
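
For reference, Sink() above writes a short text header next to the per-order binary files. For a hypothetical trigram model with adjusted counts 1000, 500 and 200 and probability/backoff payload, base.kenlm_intermediate would read:

KenLM intermediate binary file
Counts 1000 500 200
Payload pb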

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/model_buffer.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/model_buffer.hh b/ext/kenlm/lm/common/model_buffer.hh
deleted file mode 100644
index 92662bb..0000000
--- a/ext/kenlm/lm/common/model_buffer.hh
+++ /dev/null
@@ -1,63 +0,0 @@
-#ifndef LM_COMMON_MODEL_BUFFER_H
-#define LM_COMMON_MODEL_BUFFER_H
-
-/* Format with separate files in suffix order.  Each file contains
- * n-grams of the same order.
- */
-
-#include "util/file.hh"
-#include "util/fixed_array.hh"
-
-#include <string>
-#include <vector>
-
-namespace util { namespace stream {
-class Chains;
-class Chain;
-}} // namespaces
-
-namespace lm {
-
-class ModelBuffer {
-  public:
-    // Construct for writing.  Must call VocabFile() and fill it with null-delimited vocab words.
-    ModelBuffer(StringPiece file_base, bool keep_buffer, bool output_q);
-
-    // Load from file.
-    explicit ModelBuffer(StringPiece file_base);
-
-    // Must call VocabFile and populate before calling this function.
-    void Sink(util::stream::Chains &chains, const std::vector<uint64_t> &counts);
-
-    // Read files and write to the given chains.  If fewer chains are provided,
-    // only do the lower orders.
-    void Source(util::stream::Chains &chains);
-
-    void Source(std::size_t order_minus_1, util::stream::Chain &chain);
-
-    // The order of the n-gram model that is associated with the model buffer.
-    std::size_t Order() const { return counts_.size(); }
-    // Requires Sink or load from file.
-    const std::vector<uint64_t> &Counts() const {
-      assert(!counts_.empty());
-      return counts_;
-    }
-
-    int VocabFile() const { return vocab_file_.get(); }
-    int StealVocabFile() { return vocab_file_.release(); }
-
-    bool Keep() const { return keep_buffer_; }
-
-  private:
-    const std::string file_base_;
-    const bool keep_buffer_;
-    bool output_q_;
-    std::vector<uint64_t> counts_;
-
-    util::scoped_fd vocab_file_;
-    util::FixedArray<util::scoped_fd> files_;
-};
-
-} // namespace lm
-
-#endif // LM_COMMON_MODEL_BUFFER_H
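
A minimal usage sketch for the reading constructor above, assuming an intermediate model was previously written with keep_buffer = true under the hypothetical prefix "base" (base.kenlm_intermediate, base.vocab, base.1, base.2, ...):

#include "lm/common/model_buffer.hh"

#include <cstddef>
#include <iostream>

int main() {
  lm::ModelBuffer buffer("base");  // parses base.kenlm_intermediate and opens the files
  std::cout << "order " << buffer.Order() << '\n';
  for (std::size_t i = 0; i < buffer.Counts().size(); ++i)
    std::cout << (i + 1) << "-grams: " << buffer.Counts()[i] << '\n';
  return 0;
}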

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/ngram.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/ngram.hh b/ext/kenlm/lm/common/ngram.hh
deleted file mode 100644
index 7a6d1c3..0000000
--- a/ext/kenlm/lm/common/ngram.hh
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef LM_COMMON_NGRAM_H
-#define LM_COMMON_NGRAM_H
-
-#include "lm/weights.hh"
-#include "lm/word_index.hh"
-
-#include <cstddef>
-#include <cassert>
-#include <stdint.h>
-#include <cstring>
-
-namespace lm {
-
-class NGramHeader {
-  public:
-    NGramHeader(void *begin, std::size_t order)
-      : begin_(static_cast<WordIndex*>(begin)), end_(begin_ + order) {}
-
-    NGramHeader() : begin_(NULL), end_(NULL) {}
-
-    const uint8_t *Base() const { return reinterpret_cast<const uint8_t*>(begin_); }
-    uint8_t *Base() { return reinterpret_cast<uint8_t*>(begin_); }
-
-    void ReBase(void *to) {
-      std::size_t difference = end_ - begin_;
-      begin_ = reinterpret_cast<WordIndex*>(to);
-      end_ = begin_ + difference;
-    }
-
-    // These are for the vocab index.
-    // Lower-case in deference to STL.
-    const WordIndex *begin() const { return begin_; }
-    WordIndex *begin() { return begin_; }
-    const WordIndex *end() const { return end_; }
-    WordIndex *end() { return end_; }
-
-    std::size_t size() const { return end_ - begin_; }
-    std::size_t Order() const { return end_ - begin_; }
-
-  private:
-    WordIndex *begin_, *end_;
-};
-
-template <class PayloadT> class NGram : public NGramHeader {
-  public:
-    typedef PayloadT Payload;
-
-    NGram() : NGramHeader(NULL, 0) {}
-
-    NGram(void *begin, std::size_t order) : NGramHeader(begin, order) {}
-
-    // Would do operator++ but that can get confusing for a stream.
-    void NextInMemory() {
-      ReBase(&Value() + 1);
-    }
-
-    static std::size_t TotalSize(std::size_t order) {
-      return order * sizeof(WordIndex) + sizeof(Payload);
-    }
-    std::size_t TotalSize() const {
-      // Compiler should optimize this.
-      return TotalSize(Order());
-    }
-
-    static std::size_t OrderFromSize(std::size_t size) {
-      std::size_t ret = (size - sizeof(Payload)) / sizeof(WordIndex);
-      assert(size == TotalSize(ret));
-      return ret;
-    }
-
-    const Payload &Value() const { return *reinterpret_cast<const Payload *>(end()); }
-    Payload &Value() { return *reinterpret_cast<Payload *>(end()); }
-};
-
-} // namespace lm
-
-#endif // LM_COMMON_NGRAM_H
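
The record layout above is simply the words followed by the payload; a small sketch of the size arithmetic, assuming the ProbBackoff payload from lm/weights.hh:

#include "lm/common/ngram.hh"
#include "lm/weights.hh"

#include <cstddef>
#include <iostream>

int main() {
  typedef lm::NGram<lm::ProbBackoff> Gram;
  // TotalSize(order) = order * sizeof(WordIndex) + sizeof(Payload); OrderFromSize inverts it.
  std::size_t bytes = Gram::TotalSize(3);
  std::cout << bytes << " bytes per trigram record, order " << Gram::OrderFromSize(bytes) << '\n';
  return 0;
}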

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/common/ngram_stream.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm/lm/common/ngram_stream.hh b/ext/kenlm/lm/common/ngram_stream.hh
deleted file mode 100644
index 8bdf36e..0000000
--- a/ext/kenlm/lm/common/ngram_stream.hh
+++ /dev/null
@@ -1,65 +0,0 @@
-#ifndef LM_BUILDER_NGRAM_STREAM_H
-#define LM_BUILDER_NGRAM_STREAM_H
-
-#include "lm/common/ngram.hh"
-#include "util/stream/chain.hh"
-#include "util/stream/multi_stream.hh"
-#include "util/stream/stream.hh"
-
-#include <cstddef>
-
-namespace lm {
-
-template <class Proxy> class ProxyStream {
-  public:
-    // Make an invalid stream.
-    ProxyStream() {}
-
-    explicit ProxyStream(const util::stream::ChainPosition &position, const Proxy &proxy = Proxy())
-      : proxy_(proxy), stream_(position) {
-      proxy_.ReBase(stream_.Get());
-    }
-
-    Proxy &operator*() { return proxy_; }
-    const Proxy &operator*() const { return proxy_; }
-
-    Proxy *operator->() { return &proxy_; }
-    const Proxy *operator->() const { return &proxy_; }
-
-    void *Get() { return stream_.Get(); }
-    const void *Get() const { return stream_.Get(); }
-
-    operator bool() const { return stream_; }
-    bool operator!() const { return !stream_; }
-    void Poison() { stream_.Poison(); }
-
-    ProxyStream<Proxy> &operator++() {
-      ++stream_;
-      proxy_.ReBase(stream_.Get());
-      return *this;
-    }
-
-  private:
-    Proxy proxy_;
-    util::stream::Stream stream_;
-};
-
-template <class Payload> class NGramStream : public ProxyStream<NGram<Payload> > {
-  public:
-    // Make an invalid stream.
-    NGramStream() {}
-
-    explicit NGramStream(const util::stream::ChainPosition &position) :
-      ProxyStream<NGram<Payload> >(position, NGram<Payload>(NULL, NGram<Payload>::OrderFromSize(position.GetChain().EntrySize()))) {}
-};
-
-template <class Payload> class NGramStreams : public util::stream::GenericStreams<NGramStream<Payload> > {
-  private:
-    typedef util::stream::GenericStreams<NGramStream<Payload> > P;
-  public:
-    NGramStreams() : P() {}
-    NGramStreams(const util::stream::ChainPositions &positions) : P(positions) {}
-};
-
-} // namespace
-#endif // LM_BUILDER_NGRAM_STREAM_H
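
A sketch of iterating NGramStream over one chain position, assuming the chain carries NGram<BuildingPayload> records as in the builder code earlier in this diff; the function is meant to run as a worker attached to the chain:

#include "lm/builder/payload.hh"
#include "lm/common/ngram_stream.hh"

#include <cstddef>

std::size_t CountGrams(const util::stream::ChainPosition &position) {
  std::size_t count = 0;
  for (lm::NGramStream<lm::builder::BuildingPayload> stream(position); stream; ++stream)
    ++count;
  return count;
}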