Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:33:57 UTC

[09/51] [partial] incubator-joshua git commit: Converted KenLM into a submodule

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/ngram_query.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/ngram_query.hh b/ext/kenlm/lm/ngram_query.hh
deleted file mode 100644
index 4430841..0000000
--- a/ext/kenlm/lm/ngram_query.hh
+++ /dev/null
@@ -1,113 +0,0 @@
-#ifndef LM_NGRAM_QUERY_H
-#define LM_NGRAM_QUERY_H
-
-#include "lm/enumerate_vocab.hh"
-#include "lm/model.hh"
-#include "util/file_stream.hh"
-#include "util/file_piece.hh"
-#include "util/usage.hh"
-
-#include <cstdlib>
-#include <string>
-#include <cmath>
-
-namespace lm {
-namespace ngram {
-
-class QueryPrinter {
-  public:
-    QueryPrinter(int fd, bool print_word, bool print_line, bool print_summary, bool flush)
-      : out_(fd), print_word_(print_word), print_line_(print_line), print_summary_(print_summary), flush_(flush) {}
-
-    void Word(StringPiece surface, WordIndex vocab, const FullScoreReturn &ret) {
-      if (!print_word_) return;
-      out_ << surface << '=' << vocab << ' ' << static_cast<unsigned int>(ret.ngram_length) << ' ' << ret.prob << '\t';
-      if (flush_) out_.flush();
-    }
-
-    void Line(uint64_t oov, float total) {
-      if (!print_line_) return;
-      out_ << "Total: " << total << " OOV: " << oov << '\n';
-      if (flush_) out_.flush();
-    }
-
-    void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) {
-      if (!print_summary_) return;
-      out_ <<
-        "Perplexity including OOVs:\t" << ppl_including_oov << "\n"
-        "Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
-        "OOVs:\t" << corpus_oov << "\n"
-        "Tokens:\t" << corpus_tokens << '\n';
-      out_.flush();
-    }
-
-  private:
-    util::FileStream out_;
-    bool print_word_;
-    bool print_line_;
-    bool print_summary_;
-    bool flush_;
-};
-
-template <class Model, class Printer> void Query(const Model &model, bool sentence_context, Printer &printer) {
-  typename Model::State state, out;
-  lm::FullScoreReturn ret;
-  StringPiece word;
-
-  util::FilePiece in(0);
-
-  double corpus_total = 0.0;
-  double corpus_total_oov_only = 0.0;
-  uint64_t corpus_oov = 0;
-  uint64_t corpus_tokens = 0;
-
-  while (true) {
-    state = sentence_context ? model.BeginSentenceState() : model.NullContextState();
-    float total = 0.0;
-    uint64_t oov = 0;
-
-    while (in.ReadWordSameLine(word)) {
-      lm::WordIndex vocab = model.GetVocabulary().Index(word);
-      ret = model.FullScore(state, vocab, out);
-      if (vocab == model.GetVocabulary().NotFound()) {
-        ++oov;
-        corpus_total_oov_only += ret.prob;
-      }
-      total += ret.prob;
-      printer.Word(word, vocab, ret);
-      ++corpus_tokens;
-      state = out;
-    }
-    // If people don't have a newline after their last query, this won't add a </s>.
-    // Sue me.
-    try {
-      UTIL_THROW_IF('\n' != in.get(), util::Exception, "FilePiece is confused.");
-    } catch (const util::EndOfFileException &e) { break; }
-    if (sentence_context) {
-      ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out);
-      total += ret.prob;
-      ++corpus_tokens;
-      printer.Word("</s>", model.GetVocabulary().EndSentence(), ret);
-    }
-    printer.Line(oov, total);
-    corpus_total += total;
-    corpus_oov += oov;
-  }
-  printer.Summary(
-      pow(10.0, -(corpus_total / static_cast<double>(corpus_tokens))), // PPL including OOVs
-      pow(10.0, -((corpus_total - corpus_total_oov_only) / static_cast<double>(corpus_tokens - corpus_oov))), // PPL excluding OOVs
-      corpus_oov,
-      corpus_tokens);
-}
-
-template <class Model> void Query(const char *file, const Config &config, bool sentence_context, QueryPrinter &printer) {
-  Model model(file, config);
-  Query<Model, QueryPrinter>(model, sentence_context, printer);
-}
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_NGRAM_QUERY_H
-
-
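
Note: Summary() in the header above derives perplexity from the accumulated
log10 probabilities as 10^(-total/tokens). A minimal sketch of that arithmetic,
with illustrative names that are not part of the header:

    #include <cmath>
    #include <stdint.h>

    // Illustrative helper, not part of ngram_query.hh: perplexity from a
    // sum of log10 word probabilities, as computed at the end of Query().
    double Perplexity(double total_log10, uint64_t tokens) {
      return std::pow(10.0, -(total_log10 / static_cast<double>(tokens)));
    }
    // e.g. 1000 tokens with total log10 probability -2500 gives 10^2.5, about 316.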

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/partial.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/partial.hh b/ext/kenlm/lm/partial.hh
deleted file mode 100644
index 9e4e352..0000000
--- a/ext/kenlm/lm/partial.hh
+++ /dev/null
@@ -1,166 +0,0 @@
-#ifndef LM_PARTIAL_H
-#define LM_PARTIAL_H
-
-#include "lm/return.hh"
-#include "lm/state.hh"
-
-#include <algorithm>
-#include <cassert>
-
-namespace lm {
-namespace ngram {
-
-struct ExtendReturn {
-  float adjust;
-  bool make_full;
-  unsigned char next_use;
-};
-
-template <class Model> ExtendReturn ExtendLoop(
-    const Model &model,
-    unsigned char seen, const WordIndex *add_rbegin, const WordIndex *add_rend, const float *backoff_start,
-    const uint64_t *pointers, const uint64_t *pointers_end,
-    uint64_t *&pointers_write,
-    float *backoff_write) {
-  unsigned char add_length = add_rend - add_rbegin;
-
-  float backoff_buf[2][KENLM_MAX_ORDER - 1];
-  float *backoff_in = backoff_buf[0], *backoff_out = backoff_buf[1];
-  std::copy(backoff_start, backoff_start + add_length, backoff_in);
-
-  ExtendReturn value;
-  value.make_full = false;
-  value.adjust = 0.0;
-  value.next_use = add_length;
-
-  unsigned char i = 0;
-  unsigned char length = pointers_end - pointers;
-  // If pointers_write is NULL, the existing left state is full, so we should use completed probabilities.
-  if (pointers_write) {
-    // Using full context, writing to new left state.
-    for (; i < length; ++i) {
-      FullScoreReturn ret(model.ExtendLeft(
-          add_rbegin, add_rbegin + value.next_use,
-          backoff_in,
-          pointers[i], i + seen + 1,
-          backoff_out,
-          value.next_use));
-      std::swap(backoff_in, backoff_out);
-      if (ret.independent_left) {
-        value.adjust += ret.prob;
-        value.make_full = true;
-        ++i;
-        break;
-      }
-      value.adjust += ret.rest;
-      *pointers_write++ = ret.extend_left;
-      if (value.next_use != add_length) {
-        value.make_full = true;
-        ++i;
-        break;
-      }
-    }
-  }
-  // Using some of the new context.
-  for (; i < length && value.next_use; ++i) {
-    FullScoreReturn ret(model.ExtendLeft(
-        add_rbegin, add_rbegin + value.next_use,
-        backoff_in,
-        pointers[i], i + seen + 1,
-        backoff_out,
-        value.next_use));
-    std::swap(backoff_in, backoff_out);
-    value.adjust += ret.prob;
-  }
-  float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1);
-  // Using none of the new context.
-  value.adjust += unrest;
-
-  std::copy(backoff_in, backoff_in + value.next_use, backoff_write);
-  return value;
-}
-
-template <class Model> float RevealBefore(const Model &model, const Right &reveal, const unsigned char seen, bool reveal_full, Left &left, Right &right) {
-  assert(seen < reveal.length || reveal_full);
-  uint64_t *pointers_write = reveal_full ? NULL : left.pointers;
-  float backoff_buffer[KENLM_MAX_ORDER - 1];
-  ExtendReturn value(ExtendLoop(
-      model,
-      seen, reveal.words + seen, reveal.words + reveal.length, reveal.backoff + seen,
-      left.pointers, left.pointers + left.length,
-      pointers_write,
-      left.full ? backoff_buffer : (right.backoff + right.length)));
-  if (reveal_full) {
-    left.length = 0;
-    value.make_full = true;
-  } else {
-    left.length = pointers_write - left.pointers;
-    value.make_full |= (left.length == model.Order() - 1);
-  }
-  if (left.full) {
-    for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i];
-  } else {
-    // If left wasn't full when it came in, put words into right state.
-    std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length);
-    right.length += value.next_use;
-    left.full = value.make_full || (right.length == model.Order() - 1);
-  }
-  return value.adjust;
-}
-
-template <class Model> float RevealAfter(const Model &model, Left &left, Right &right, const Left &reveal, unsigned char seen) {
-  assert(seen < reveal.length || reveal.full);
-  uint64_t *pointers_write = left.full ? NULL : (left.pointers + left.length);
-  ExtendReturn value(ExtendLoop(
-      model,
-      seen, right.words, right.words + right.length, right.backoff,
-      reveal.pointers + seen, reveal.pointers + reveal.length,
-      pointers_write,
-      right.backoff));
-  if (reveal.full) {
-    for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += right.backoff[i];
-    right.length = 0;
-    value.make_full = true;
-  } else {
-    right.length = value.next_use;
-    value.make_full |= (right.length == model.Order() - 1);
-  }
-  if (!left.full) {
-    left.length = pointers_write - left.pointers;
-    left.full = value.make_full || (left.length == model.Order() - 1);
-  }
-  return value.adjust;
-}
-
-template <class Model> float Subsume(const Model &model, Left &first_left, const Right &first_right, const Left &second_left, Right &second_right, const unsigned int between_length) {
-  assert(first_right.length < KENLM_MAX_ORDER);
-  assert(second_left.length < KENLM_MAX_ORDER);
-  assert(between_length < KENLM_MAX_ORDER - 1);
-  uint64_t *pointers_write = first_left.full ? NULL : (first_left.pointers + first_left.length);
-  float backoff_buffer[KENLM_MAX_ORDER - 1];
-  ExtendReturn value(ExtendLoop(
-        model,
-        between_length, first_right.words, first_right.words + first_right.length, first_right.backoff,
-        second_left.pointers, second_left.pointers + second_left.length,
-        pointers_write,
-        second_left.full ? backoff_buffer : (second_right.backoff + second_right.length)));
-  if (second_left.full) {
-    for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i];
-  } else {
-    std::copy(first_right.words, first_right.words + value.next_use, second_right.words + second_right.length);
-    second_right.length += value.next_use;
-    value.make_full |= (second_right.length == model.Order() - 1);
-  }
-  if (!first_left.full) {
-    first_left.length = pointers_write - first_left.pointers;
-    first_left.full = value.make_full || second_left.full || (first_left.length == model.Order() - 1);
-  }
-  assert(first_left.length < KENLM_MAX_ORDER);
-  assert(second_right.length < KENLM_MAX_ORDER);
-  return value.adjust;
-}
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_PARTIAL_H
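
Note: the RevealBefore/RevealAfter entry points above are easiest to read next
to partial_test.cc below. A hedged sketch of the calling convention, mirroring
the SimpleBefore test (Left and Right come from lm/state.hh; the model type and
constants are taken from the test, not fixed by this header):

    #include "lm/model.hh"
    #include "lm/partial.hh"

    // Sketch: reveal one context word to the left of an empty hypothesis state.
    float Example(const lm::ngram::RestProbingModel &m) {
      lm::ngram::Left left;   left.full = false; left.length = 0;
      lm::ngram::Right right; right.length = 0;
      lm::ngram::Right reveal;
      reveal.length = 1;
      reveal.words[0] = m.GetVocabulary().Index(".");
      reveal.backoff[0] = -0.845098f;  // backoff of "." in the test model
      // Returns the score adjustment; here the states simply absorb the word.
      return lm::ngram::RevealBefore(m, reveal, 0, false, left, right);
    }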

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/partial_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/partial_test.cc b/ext/kenlm/lm/partial_test.cc
deleted file mode 100644
index adb644f..0000000
--- a/ext/kenlm/lm/partial_test.cc
+++ /dev/null
@@ -1,199 +0,0 @@
-#include "lm/partial.hh"
-
-#include "lm/left.hh"
-#include "lm/model.hh"
-#include "util/tokenize_piece.hh"
-
-#define BOOST_TEST_MODULE PartialTest
-#include <boost/test/unit_test.hpp>
-#include <boost/test/floating_point_comparison.hpp>
-
-namespace lm {
-namespace ngram {
-namespace {
-
-const char *TestLocation() {
-  if (boost::unit_test::framework::master_test_suite().argc < 2) {
-    return "test.arpa";
-  }
-  return boost::unit_test::framework::master_test_suite().argv[1];
-}
-
-Config SilentConfig() {
-  Config config;
-  config.arpa_complain = Config::NONE;
-  config.messages = NULL;
-  return config;
-}
-
-struct ModelFixture {
-  ModelFixture() : m(TestLocation(), SilentConfig()) {}
-
-  RestProbingModel m;
-};
-
-BOOST_FIXTURE_TEST_SUITE(suite, ModelFixture)
-
-BOOST_AUTO_TEST_CASE(SimpleBefore) {
-  Left left;
-  left.full = false;
-  left.length = 0;
-  Right right;
-  right.length = 0;
-
-  Right reveal;
-  reveal.length = 1;
-  WordIndex period = m.GetVocabulary().Index(".");
-  reveal.words[0] = period;
-  reveal.backoff[0] = -0.845098;
-
-  BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 0, false, left, right), 0.001);
-  BOOST_CHECK_EQUAL(0, left.length);
-  BOOST_CHECK(!left.full);
-  BOOST_CHECK_EQUAL(1, right.length);
-  BOOST_CHECK_EQUAL(period, right.words[0]);
-  BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001);
-
-  WordIndex more = m.GetVocabulary().Index("more");
-  reveal.words[1] = more;
-  reveal.backoff[1] =  -0.4771212;
-  reveal.length = 2;
-  BOOST_CHECK_CLOSE(0.0, RevealBefore(m, reveal, 1, false, left, right), 0.001);
-  BOOST_CHECK_EQUAL(0, left.length);
-  BOOST_CHECK(!left.full);
-  BOOST_CHECK_EQUAL(2, right.length);
-  BOOST_CHECK_EQUAL(period, right.words[0]);
-  BOOST_CHECK_EQUAL(more, right.words[1]);
-  BOOST_CHECK_CLOSE(-0.845098, right.backoff[0], 0.001);
-  BOOST_CHECK_CLOSE(-0.4771212, right.backoff[1], 0.001);
-}
-
-BOOST_AUTO_TEST_CASE(AlsoWouldConsider) {
-  WordIndex would = m.GetVocabulary().Index("would");
-  WordIndex consider = m.GetVocabulary().Index("consider");
-
-  ChartState current;
-  current.left.length = 1;
-  current.left.pointers[0] = would;
-  current.left.full = false;
-  current.right.length = 1;
-  current.right.words[0] = would;
-  current.right.backoff[0] = -0.30103;
-
-  Left after;
-  after.full = false;
-  after.length = 1;
-  after.pointers[0] = consider;
-
-  // adjustment for would consider
-  BOOST_CHECK_CLOSE(-1.687872 - -0.2922095 - 0.30103, RevealAfter(m, current.left, current.right, after, 0), 0.001);
-
-  BOOST_CHECK_EQUAL(2, current.left.length);
-  BOOST_CHECK_EQUAL(would, current.left.pointers[0]);
-  BOOST_CHECK_EQUAL(false, current.left.full);
-
-  WordIndex also = m.GetVocabulary().Index("also");
-  Right before;
-  before.length = 1;
-  before.words[0] = also;
-  before.backoff[0] = -0.30103;
-  // r(would) = -0.2922095 [i would], r(would -> consider) = -1.988902 [b(would) + p(consider)]
-  // p(also -> would) = -2, p(also would -> consider) = -3
-  BOOST_CHECK_CLOSE(-2 + 0.2922095 -3 + 1.988902, RevealBefore(m, before, 0, false, current.left, current.right), 0.001);
-  BOOST_CHECK_EQUAL(0, current.left.length);
-  BOOST_CHECK(current.left.full);
-  BOOST_CHECK_EQUAL(2, current.right.length);
-  BOOST_CHECK_EQUAL(would, current.right.words[0]);
-  BOOST_CHECK_EQUAL(also, current.right.words[1]);
-}
-
-BOOST_AUTO_TEST_CASE(EndSentence) {
-  WordIndex loin = m.GetVocabulary().Index("loin");
-  WordIndex period = m.GetVocabulary().Index(".");
-  WordIndex eos = m.GetVocabulary().EndSentence();
-
-  ChartState between;
-  between.left.length = 1;
-  between.left.pointers[0] = eos;
-  between.left.full = true;
-  between.right.length = 0;
-
-  Right before;
-  before.words[0] = period;
-  before.words[1] = loin;
-  before.backoff[0] = -0.845098;
-  before.backoff[1] = 0.0;
-
-  before.length = 1;
-  BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001);
-  BOOST_CHECK_EQUAL(0, between.left.length);
-}
-
-float ScoreFragment(const RestProbingModel &model, unsigned int *begin, unsigned int *end, ChartState &out) {
-  RuleScore<RestProbingModel> scorer(model, out);
-  for (unsigned int *i = begin; i < end; ++i) {
-    scorer.Terminal(*i);
-  }
-  return scorer.Finish();
-}
-
-void CheckAdjustment(const RestProbingModel &model, float expect, const Right &before_in, bool before_full, ChartState between, const Left &after_in) {
-  Right before(before_in);
-  Left after(after_in);
-  after.full = false;
-  float got = 0.0;
-  for (unsigned int i = 1; i < 5; ++i) {
-    if (before_in.length >= i) {
-      before.length = i;
-      got += RevealBefore(model, before, i - 1, false, between.left, between.right);
-    }
-    if (after_in.length >= i) {
-      after.length = i;
-      got += RevealAfter(model, between.left, between.right, after, i - 1);
-    }
-  }
-  if (after_in.full) {
-    after.full = true;
-    got += RevealAfter(model, between.left, between.right, after, after.length);
-  }
-  if (before_full) {
-    got += RevealBefore(model, before, before.length, true, between.left, between.right);
-  }
-  // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
-  BOOST_CHECK(fabs(expect - got) < 0.001);
-}
-
-void FullDivide(const RestProbingModel &model, StringPiece str) {
-  std::vector<WordIndex> indices;
-  for (util::TokenIter<util::SingleCharacter, true> i(str, ' '); i; ++i) {
-    indices.push_back(model.GetVocabulary().Index(*i));
-  }
-  ChartState full_state;
-  float full = ScoreFragment(model, &indices.front(), &indices.back() + 1, full_state);
-
-  ChartState before_state;
-  before_state.left.full = false;
-  RuleScore<RestProbingModel> before_scorer(model, before_state);
-  float before_score = 0.0;
-  for (unsigned int before = 0; before < indices.size(); ++before) {
-    for (unsigned int after = before; after <= indices.size(); ++after) {
-      ChartState after_state, between_state;
-      float after_score = ScoreFragment(model, &indices.front() + after, &indices.front() + indices.size(), after_state);
-      float between_score = ScoreFragment(model, &indices.front() + before, &indices.front() + after, between_state);
-      CheckAdjustment(model, full - before_score - after_score - between_score, before_state.right, before_state.left.full, between_state, after_state.left);
-    }
-    before_scorer.Terminal(indices[before]);
-    before_score = before_scorer.Finish();
-  }
-}
-
-BOOST_AUTO_TEST_CASE(Strings) {
-  FullDivide(m, "also would consider");
-  FullDivide(m, "looking on a little more loin . </s>");
-  FullDivide(m, "in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>");
-}
-
-BOOST_AUTO_TEST_SUITE_END()
-} // namespace
-} // namespace ngram
-} // namespace lm
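
Note: CheckAdjustment above avoids BOOST_CHECK_CLOSE because that macro checks
a relative (percentage) difference, which misfires when the expected value is
zero and the computed one is merely tiny. A hedged illustration of the
distinction (standalone, not part of the test):

    #include <cmath>

    // Relative tolerance is meaningless at zero; absolute tolerance is not.
    bool CloseRelative(float expect, float got, float pct) {
      return std::fabs((expect - got) / expect) * 100.0f < pct;  // blows up at expect == 0
    }
    bool CloseAbsolute(float expect, float got, float eps) {
      return std::fabs(expect - got) < eps;  // what the test uses instead
    }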

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/quantize.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/quantize.cc b/ext/kenlm/lm/quantize.cc
deleted file mode 100644
index 02b5dbc..0000000
--- a/ext/kenlm/lm/quantize.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-/* Quantize into bins of equal size as described in
- * M. Federico and N. Bertoldi. 2006. How many bits are needed
- * to store probabilities for phrase-based translation? In Proc.
- * of the Workshop on Statistical Machine Translation, pages
- * 94–101, New York City, June. Association for Computational
- * Linguistics.
- */
-
-#include "lm/quantize.hh"
-
-#include "lm/binary_format.hh"
-#include "lm/lm_exception.hh"
-#include "util/file.hh"
-
-#include <algorithm>
-#include <numeric>
-
-namespace lm {
-namespace ngram {
-
-namespace {
-
-void MakeBins(std::vector<float> &values, float *centers, uint32_t bins) {
-  std::sort(values.begin(), values.end());
-  std::vector<float>::const_iterator start = values.begin(), finish;
-  for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) {
-    finish = values.begin() + ((values.size() * static_cast<uint64_t>(i + 1)) / bins);
-    if (finish == start) {
-      // zero length bucket.
-      *centers = i ? *(centers - 1) : -std::numeric_limits<float>::infinity();
-    } else {
-      *centers = std::accumulate(start, finish, 0.0) / static_cast<float>(finish - start);
-    }
-  }
-}
-
-const char kSeparatelyQuantizeVersion = 2;
-
-} // namespace
-
-void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
-  unsigned char buffer[3];
-  file.ReadForConfig(buffer, 3, offset);
-  char version = buffer[0];
-  config.prob_bits = buffer[1];
-  config.backoff_bits = buffer[2];
-  if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion);
-}
-
-void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
-  prob_bits_ = config.prob_bits;
-  backoff_bits_ = config.backoff_bits;
-  // We need the reserved values.
-  if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero");
-  if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
-  if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits.  Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
-  if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits.  Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
-  // Reserve 8 byte header for bit counts.
-  actual_base_ = static_cast<uint8_t*>(base);
-  float *start = reinterpret_cast<float*>(actual_base_ + 8);
-  for (unsigned char i = 0; i < order - 2; ++i) {
-    tables_[i][0] = Bins(prob_bits_, start);
-    start += (1ULL << prob_bits_);
-    tables_[i][1] = Bins(backoff_bits_, start);
-    start += (1ULL << backoff_bits_);
-  }
-  longest_ = tables_[order - 2][0] = Bins(prob_bits_, start);
-}
-
-void SeparatelyQuantize::Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff) {
-  TrainProb(order, prob);
-
-  // Backoff
-  float *centers = tables_[order - 2][1].Populate();
-  *(centers++) = kNoExtensionBackoff;
-  *(centers++) = kExtensionBackoff;
-  MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2);
-}
-
-void SeparatelyQuantize::TrainProb(uint8_t order, std::vector<float> &prob) {
-  float *centers = tables_[order - 2][0].Populate();
-  MakeBins(prob, centers, (1ULL << prob_bits_));
-}
-
-void SeparatelyQuantize::FinishedLoading(const Config &config) {
-  uint8_t *actual_base = actual_base_;
-  *(actual_base++) = kSeparatelyQuantizeVersion; // version
-  *(actual_base++) = config.prob_bits;
-  *(actual_base++) = config.backoff_bits;
-}
-
-} // namespace ngram
-} // namespace lm
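
Note: MakeBins() implements the equal-population binning of Federico & Bertoldi
(2006): sort the values, cut them into slices of (nearly) equal size, and use
each slice's mean as the bin center. A self-contained sketch of the same idea,
under illustrative names:

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <numeric>
    #include <vector>
    #include <stdint.h>

    // Illustrative reimplementation: center i is the mean of slice i of the
    // sorted values; an empty slice repeats the previous center.
    std::vector<float> Centers(std::vector<float> values, uint32_t bins) {
      std::sort(values.begin(), values.end());
      std::vector<float> centers(bins);
      std::size_t start = 0;
      for (uint32_t i = 0; i < bins; ++i) {
        std::size_t finish = (values.size() * static_cast<uint64_t>(i + 1)) / bins;
        if (finish == start) {  // zero-length bucket
          centers[i] = i ? centers[i - 1] : -std::numeric_limits<float>::infinity();
        } else {
          centers[i] = std::accumulate(values.begin() + start, values.begin() + finish, 0.0)
              / static_cast<float>(finish - start);
        }
        start = finish;
      }
      return centers;
    }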

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/quantize.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/quantize.hh b/ext/kenlm/lm/quantize.hh
deleted file mode 100644
index 8500ace..0000000
--- a/ext/kenlm/lm/quantize.hh
+++ /dev/null
@@ -1,233 +0,0 @@
-#ifndef LM_QUANTIZE_H
-#define LM_QUANTIZE_H
-
-#include "lm/blank.hh"
-#include "lm/config.hh"
-#include "lm/max_order.hh"
-#include "lm/model_type.hh"
-#include "util/bit_packing.hh"
-
-#include <algorithm>
-#include <vector>
-
-#include <stdint.h>
-
-#include <iostream>
-
-namespace lm {
-namespace ngram {
-
-struct Config;
-class BinaryFormat;
-
-/* Store values directly and don't quantize. */
-class DontQuantize {
-  public:
-    static const ModelType kModelTypeAdd = static_cast<ModelType>(0);
-    static void UpdateConfigFromBinary(const BinaryFormat &, uint64_t, Config &) {}
-    static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; }
-    static uint8_t MiddleBits(const Config &/*config*/) { return 63; }
-    static uint8_t LongestBits(const Config &/*config*/) { return 31; }
-
-    class MiddlePointer {
-      public:
-        MiddlePointer(const DontQuantize & /*quant*/, unsigned char /*order_minus_2*/, util::BitAddress address) : address_(address) {}
-
-        MiddlePointer() : address_(NULL, 0) {}
-
-        bool Found() const {
-          return address_.base != NULL;
-        }
-
-        float Prob() const {
-          return util::ReadNonPositiveFloat31(address_.base, address_.offset);
-        }
-
-        float Backoff() const {
-          return util::ReadFloat32(address_.base, address_.offset + 31);
-        }
-
-        float Rest() const { return Prob(); }
-
-        void Write(float prob, float backoff) {
-          util::WriteNonPositiveFloat31(address_.base, address_.offset, prob);
-          util::WriteFloat32(address_.base, address_.offset + 31, backoff);
-        }
-
-      private:
-        util::BitAddress address_;
-    };
-
-    class LongestPointer {
-      public:
-        explicit LongestPointer(const DontQuantize &/*quant*/, util::BitAddress address) : address_(address) {}
-
-        LongestPointer() : address_(NULL, 0) {}
-
-        bool Found() const {
-          return address_.base != NULL;
-        }
-
-        float Prob() const {
-          return util::ReadNonPositiveFloat31(address_.base, address_.offset);
-        }
-
-        void Write(float prob) {
-          util::WriteNonPositiveFloat31(address_.base, address_.offset, prob);
-        }
-
-      private:
-        util::BitAddress address_;
-    };
-
-    DontQuantize() {}
-
-    void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {}
-
-    static const bool kTrain = false;
-    // These should never be called because kTrain is false.
-    void Train(uint8_t /*order*/, std::vector<float> &/*prob*/, std::vector<float> &/*backoff*/) {}
-    void TrainProb(uint8_t, std::vector<float> &/*prob*/) {}
-
-    void FinishedLoading(const Config &) {}
-};
-
-class SeparatelyQuantize {
-  private:
-    class Bins {
-      public:
-        // Sigh C++ default constructor
-        Bins() {}
-
-        Bins(uint8_t bits, float *begin) : begin_(begin), end_(begin_ + (1ULL << bits)), bits_(bits), mask_((1ULL << bits) - 1) {}
-
-        float *Populate() { return begin_; }
-
-        uint64_t EncodeProb(float value) const {
-          return Encode(value, 0);
-        }
-
-        uint64_t EncodeBackoff(float value) const {
-          if (value == 0.0) {
-            return HasExtension(value) ? kExtensionQuant : kNoExtensionQuant;
-          }
-          return Encode(value, 2);
-        }
-
-        float Decode(std::size_t off) const { return begin_[off]; }
-
-        uint8_t Bits() const { return bits_; }
-
-        uint64_t Mask() const { return mask_; }
-
-      private:
-        uint64_t Encode(float value, size_t reserved) const {
-          const float *above = std::lower_bound(static_cast<const float*>(begin_) + reserved, end_, value);
-          if (above == begin_ + reserved) return reserved;
-          if (above == end_) return end_ - begin_ - 1;
-          return above - begin_ - (value - *(above - 1) < *above - value);
-        }
-
-        float *begin_;
-        const float *end_;
-        uint8_t bits_;
-        uint64_t mask_;
-    };
-
-  public:
-    static const ModelType kModelTypeAdd = kQuantAdd;
-
-    static void UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config);
-
-    static uint64_t Size(uint8_t order, const Config &config) {
-      uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);
-      uint64_t middle_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.backoff_bits)) * sizeof(float) + longest_table;
-      // unigrams are currently not quantized so no need for a table.
-      return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding */ 8;
-    }
-
-    static uint8_t MiddleBits(const Config &config) { return config.prob_bits + config.backoff_bits; }
-    static uint8_t LongestBits(const Config &config) { return config.prob_bits; }
-
-    class MiddlePointer {
-      public:
-        MiddlePointer(const SeparatelyQuantize &quant, unsigned char order_minus_2, const util::BitAddress &address) : bins_(quant.GetTables(order_minus_2)), address_(address) {}
-
-        MiddlePointer() : address_(NULL, 0) {}
-
-        bool Found() const { return address_.base != NULL; }
-
-        float Prob() const {
-          return ProbBins().Decode(util::ReadInt25(address_.base, address_.offset + BackoffBins().Bits(), ProbBins().Bits(), ProbBins().Mask()));
-        }
-
-        float Backoff() const {
-          return BackoffBins().Decode(util::ReadInt25(address_.base, address_.offset, BackoffBins().Bits(), BackoffBins().Mask()));
-        }
-
-        float Rest() const { return Prob(); }
-
-        void Write(float prob, float backoff) const {
-          util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
-              (ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff));
-        }
-
-      private:
-        const Bins &ProbBins() const { return bins_[0]; }
-        const Bins &BackoffBins() const { return bins_[1]; }
-        const Bins *bins_;
-
-        util::BitAddress address_;
-    };
-
-    class LongestPointer {
-      public:
-        LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {}
-
-        LongestPointer() : address_(NULL, 0) {}
-
-        bool Found() const { return address_.base != NULL; }
-
-        void Write(float prob) const {
-          util::WriteInt25(address_.base, address_.offset, table_->Bits(), table_->EncodeProb(prob));
-        }
-
-        float Prob() const {
-          return table_->Decode(util::ReadInt25(address_.base, address_.offset, table_->Bits(), table_->Mask()));
-        }
-
-      private:
-        const Bins *table_;
-        util::BitAddress address_;
-    };
-
-    SeparatelyQuantize() {}
-
-    void SetupMemory(void *start, unsigned char order, const Config &config);
-
-    static const bool kTrain = true;
-    // Assumes 0.0 is removed from backoff.
-    void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff);
-    // Train just probabilities (for longest order).
-    void TrainProb(uint8_t order, std::vector<float> &prob);
-
-    void FinishedLoading(const Config &config);
-
-    const Bins *GetTables(unsigned char order_minus_2) const { return tables_[order_minus_2]; }
-
-    const Bins &LongestTable() const { return longest_; }
-
-  private:
-    Bins tables_[KENLM_MAX_ORDER - 1][2];
-
-    Bins longest_;
-
-    uint8_t *actual_base_;
-
-    uint8_t prob_bits_, backoff_bits_;
-};
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_QUANTIZE_H
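
Note: Bins::Encode above is a nearest-center lookup: std::lower_bound finds the
first center not less than the value, then the final subtraction picks whichever
neighbor is closer. A hedged standalone version (without the reserved slots):

    #include <algorithm>
    #include <stdint.h>

    // Illustrative: index of the center in sorted [begin, end) nearest to value.
    uint64_t Nearest(const float *begin, const float *end, float value) {
      const float *above = std::lower_bound(begin, end, value);
      if (above == begin) return 0;
      if (above == end) return (end - begin) - 1;
      // True (1) exactly when *(above - 1) is the closer center.
      return (above - begin) - (value - *(above - 1) < *above - value);
    }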

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/query_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/query_main.cc b/ext/kenlm/lm/query_main.cc
deleted file mode 100644
index 0bd28f7..0000000
--- a/ext/kenlm/lm/query_main.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-#include "lm/ngram_query.hh"
-#include "util/getopt.hh"
-
-#ifdef WITH_NPLM
-#include "lm/wrappers/nplm.hh"
-#endif
-
-#include <stdlib.h>
-
-void Usage(const char *name) {
-  std::cerr <<
-    "KenLM was compiled with maximum order " << KENLM_MAX_ORDER << ".\n"
-    "Usage: " << name << " [-b] [-n] [-v summary|sentence|word] [-l lazy|populate|read|parallel] lm_file\n"
-    "-b: Do not buffer output.\n"
-    "-n: Do not wrap the input in <s> and </s>.\n"
-    "-v summary|sentence|word: Level of verbosity.\n"
-    "-l lazy|populate|read|parallel: Load lazily, with populate, with malloc+read, or with parallel read.\n"
-    "The default loading method is populate on Linux and read on others.\n";
-  exit(1);
-}
-
-int main(int argc, char *argv[]) {
-  if (argc == 1 || (argc == 2 && !strcmp(argv[1], "--help")))
-    Usage(argv[0]);
-
-  lm::ngram::Config config;
-  bool sentence_context = true;
-  unsigned int verbosity = 2;
-  bool flush = false;
-
-  int opt;
-  while ((opt = getopt(argc, argv, "bnv:l:")) != -1) {
-    switch (opt) {
-      case 'b':
-        flush = true;
-        break;
-      case 'n':
-        sentence_context = false;
-        break;
-      case 'v':
-        if (!strcmp(optarg, "word") || !strcmp(optarg, "2")) {
-          verbosity = 2;
-        } else if (!strcmp(optarg, "sentence") || !strcmp(optarg, "1")) {
-          verbosity = 1;
-        } else if (!strcmp(optarg, "summary") || !strcmp(optarg, "0")) {
-          verbosity = 0;
-        } else {
-          Usage(argv[0]);
-        }
-        break;
-      case 'l':
-        if (!strcmp(optarg, "lazy")) {
-          config.load_method = util::LAZY;
-        } else if (!strcmp(optarg, "populate")) {
-          config.load_method = util::POPULATE_OR_READ;
-        } else if (!strcmp(optarg, "read")) {
-          config.load_method = util::READ;
-        } else if (!strcmp(optarg, "parallel")) {
-          config.load_method = util::PARALLEL_READ;
-        } else {
-          Usage(argv[0]);
-        }
-        break;
-      case 'h':
-      default:
-        Usage(argv[0]);
-    }
-  }
-  if (optind + 1 != argc)
-    Usage(argv[0]);
-  lm::ngram::QueryPrinter printer(1, verbosity >= 2, verbosity >= 1, true, flush);
-  const char *file = argv[optind];
-  try {
-    using namespace lm::ngram;
-    ModelType model_type;
-    if (RecognizeBinary(file, model_type)) {
-      switch(model_type) {
-        case PROBING:
-          Query<lm::ngram::ProbingModel>(file, config, sentence_context, printer);
-          break;
-        case REST_PROBING:
-          Query<lm::ngram::RestProbingModel>(file, config, sentence_context, printer);
-          break;
-        case TRIE:
-          Query<TrieModel>(file, config, sentence_context, printer);
-          break;
-        case QUANT_TRIE:
-          Query<QuantTrieModel>(file, config, sentence_context, printer);
-          break;
-        case ARRAY_TRIE:
-          Query<ArrayTrieModel>(file, config, sentence_context, printer);
-          break;
-        case QUANT_ARRAY_TRIE:
-          Query<QuantArrayTrieModel>(file, config, sentence_context, printer);
-          break;
-        default:
-          std::cerr << "Unrecognized kenlm model type " << model_type << std::endl;
-          abort();
-      }
-#ifdef WITH_NPLM
-    } else if (lm::np::Model::Recognize(file)) {
-      lm::np::Model model(file);
-      Query<lm::np::Model, lm::ngram::QueryPrinter>(model, sentence_context, printer);
-#endif
-    } else {
-      Query<ProbingModel>(file, config, sentence_context, printer);
-    }
-    util::PrintUsage(std::cerr);
-  } catch (const std::exception &e) {
-    std::cerr << e.what() << std::endl;
-    return 1;
-  }
-  return 0;
-}
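
Note: the driver reads queries from stdin (FilePiece on file descriptor 0 inside
Query) and writes to stdout (file descriptor 1 handed to QueryPrinter). A hedged
sketch of the default code path, without getopt or binary-type dispatch:

    #include "lm/ngram_query.hh"

    // Illustrative equivalent of `query lm_file` with default flags:
    // word-level output, per-sentence totals, a summary, no flushing.
    int RunDefault(const char *file) {
      lm::ngram::Config config;
      lm::ngram::QueryPrinter printer(1, true, true, true, false);
      lm::ngram::Query<lm::ngram::ProbingModel>(file, config, true, printer);
      return 0;
    }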

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/read_arpa.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/read_arpa.cc b/ext/kenlm/lm/read_arpa.cc
deleted file mode 100644
index dc05a65..0000000
--- a/ext/kenlm/lm/read_arpa.cc
+++ /dev/null
@@ -1,161 +0,0 @@
-#include "lm/read_arpa.hh"
-
-#include "lm/blank.hh"
-#include "util/file.hh"
-
-#include <cmath>
-#include <cstdlib>
-#include <iostream>
-#include <sstream>
-#include <vector>
-
-#include <cctype>
-#include <cstring>
-#include <stdint.h>
-
-#ifdef WIN32
-#include <float.h>
-#endif
-
-namespace lm {
-
-// 1 for '\t', '\n', and ' '.  This is stricter than isspace.
-const bool kARPASpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-namespace {
-
-bool IsEntirelyWhiteSpace(const StringPiece &line) {
-  for (size_t i = 0; i < static_cast<size_t>(line.size()); ++i) {
-    if (!isspace(line.data()[i])) return false;
-  }
-  return true;
-}
-
-const char kBinaryMagic[] = "mmap lm http://kheafield.com/code";
-
-// strtoull isn't portable enough :-(
-uint64_t ReadCount(const std::string &from) {
-  std::stringstream stream(from);
-  uint64_t ret;
-  stream >> ret;
-  UTIL_THROW_IF(!stream, FormatLoadException, "Bad count " << from);
-  return ret;
-}
-
-} // namespace
-
-void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number) {
-  number.clear();
-  StringPiece line = in.ReadLine();
-  // In general, ARPA files can have arbitrary text before "\data\".
-  // But in KenLM, we require such lines to start with "#", so that
-  // we can do stricter error checking.
-  while (IsEntirelyWhiteSpace(line) || starts_with(line, "#")) {
-    line = in.ReadLine();
-  }
-
-  if (line != "\\data\\") {
-    if ((line.size() >= 2) && (line.data()[0] == 0x1f) && (static_cast<unsigned char>(line.data()[1]) == 0x8b)) {
-      UTIL_THROW(FormatLoadException, "Looks like a gzip file.  If this is an ARPA file, pipe " << in.FileName() << " through zcat.  If this is already in binary format, you need to decompress it because mmap doesn't work on top of gzip.");
-    }
-    if (static_cast<size_t>(line.size()) >= strlen(kBinaryMagic) && StringPiece(line.data(), strlen(kBinaryMagic)) == kBinaryMagic)
-      UTIL_THROW(FormatLoadException, "This looks like a binary file but got sent to the ARPA parser.  Did you compress the binary file or pass a binary file where only ARPA files are accepted?");
-    UTIL_THROW_IF(line.size() >= 4 && StringPiece(line.data(), 4) == "blmt", FormatLoadException, "This looks like an IRSTLM binary file.  Did you forget to pass --text yes to compile-lm?");
-    UTIL_THROW_IF(line == "iARPA", FormatLoadException, "This looks like an IRSTLM iARPA file.  You need an ARPA file.  Run\n  compile-lm --text yes " << in.FileName() << " " << in.FileName() << ".arpa\nfirst.");
-    UTIL_THROW(FormatLoadException, "first non-empty line was \"" << line << "\" not \\data\\.");
-  }
-  while (!IsEntirelyWhiteSpace(line = in.ReadLine())) {
-    if (line.size() < 6 || strncmp(line.data(), "ngram ", 6)) UTIL_THROW(FormatLoadException, "count line \"" << line << "\" doesn't begin with \"ngram \"");
-    // So strtol doesn't go off the end of line.
-    std::string remaining(line.data() + 6, line.size() - 6);
-    char *end_ptr;
-    unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10);
-    if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line);
-    if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line);
-    ++end_ptr;
-    number.push_back(ReadCount(end_ptr));
-  }
-}
-
-void ReadNGramHeader(util::FilePiece &in, unsigned int length) {
-  StringPiece line;
-  while (IsEntirelyWhiteSpace(line = in.ReadLine())) {}
-  std::stringstream expected;
-  expected << '\\' << length << "-grams:";
-  if (line != expected.str()) UTIL_THROW(FormatLoadException, "Was expecting n-gram header " << expected.str() << " but got " << line << " instead");
-}
-
-void ReadBackoff(util::FilePiece &in, Prob &/*weights*/) {
-  switch (in.get()) {
-    case '\t':
-      {
-        float got = in.ReadFloat();
-        if (got != 0.0)
-          UTIL_THROW(FormatLoadException, "Non-zero backoff " << got << " provided for an n-gram that should have no backoff");
-      }
-      break;
-    case '\n':
-      break;
-    default:
-      UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
-  }
-}
-
-void ReadBackoff(util::FilePiece &in, float &backoff) {
-  // Always make zero negative.
-  // Negative zero means that no (n+1)-gram has this n-gram as context.
-  // Therefore the hypothesis state can be shorter.  Of course, many n-grams
-  // are context for (n+1)-grams.  An algorithm in the data structure will go
-  // back and set the backoff to positive zero in these cases.
-  switch (in.get()) {
-    case '\t':
-      backoff = in.ReadFloat();
-      if (backoff == ngram::kExtensionBackoff) backoff = ngram::kNoExtensionBackoff;
-      {
-#if defined(WIN32) && !defined(__MINGW32__)
-        int float_class = _fpclass(backoff);
-        UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff);
-#else
-        int float_class = std::fpclassify(backoff);
-        UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff);
-#endif
-      }
-      UTIL_THROW_IF(in.get() != '\n', FormatLoadException, "Expected newline after backoff");
-      break;
-    case '\n':
-      backoff = ngram::kNoExtensionBackoff;
-      break;
-    default:
-      UTIL_THROW(FormatLoadException, "Expected tab or newline for backoff");
-  }
-}
-
-void ReadEnd(util::FilePiece &in) {
-  StringPiece line;
-  do {
-    line = in.ReadLine();
-  } while (IsEntirelyWhiteSpace(line));
-  if (line != "\\end\\") UTIL_THROW(FormatLoadException, "Expected \\end\\ but the ARPA file has " << line);
-
-  try {
-    while (true) {
-      line = in.ReadLine();
-      if (!IsEntirelyWhiteSpace(line)) UTIL_THROW(FormatLoadException, "Trailing line " << line);
-    }
-  } catch (const util::EndOfFileException &e) {}
-}
-
-void PositiveProbWarn::Warn(float prob) {
-  switch (action_) {
-    case THROW_UP:
-      UTIL_THROW(FormatLoadException, "Positive log probability " << prob << " in the model.  This is a bug in IRSTLM; you can set config.positive_log_probability = SILENT or pass -i to build_binary to substitute 0.0 for the log probability.  Error");
-    case COMPLAIN:
-      std::cerr << "There's a positive log probability " << prob << " in the ARPA file, probably because of a bug in IRSTLM.  This and subsequent entries will be mapped to 0 log probability." << std::endl;
-      action_ = SILENT;
-      break;
-    case SILENT:
-      break;
-  }
-}
-
-} // namespace lm
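
Note: ReadARPACounts() accepts optional leading "#" comment lines, then \data\,
then "ngram N=count" lines with consecutive orders starting at 1, terminated by
a blank line. For illustration (the counts are made up), a header it parses:

    # comment lines are allowed before \data\
    \data\
    ngram 1=4698
    ngram 2=98321
    ngram 3=175634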

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/read_arpa.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/read_arpa.hh b/ext/kenlm/lm/read_arpa.hh
deleted file mode 100644
index 64eeef3..0000000
--- a/ext/kenlm/lm/read_arpa.hh
+++ /dev/null
@@ -1,95 +0,0 @@
-#ifndef LM_READ_ARPA_H
-#define LM_READ_ARPA_H
-
-#include "lm/lm_exception.hh"
-#include "lm/word_index.hh"
-#include "lm/weights.hh"
-#include "util/file_piece.hh"
-
-#include <cstddef>
-#include <iosfwd>
-#include <vector>
-
-namespace lm {
-
-void ReadARPACounts(util::FilePiece &in, std::vector<uint64_t> &number);
-void ReadNGramHeader(util::FilePiece &in, unsigned int length);
-
-void ReadBackoff(util::FilePiece &in, Prob &weights);
-void ReadBackoff(util::FilePiece &in, float &backoff);
-inline void ReadBackoff(util::FilePiece &in, ProbBackoff &weights) {
-  ReadBackoff(in, weights.backoff);
-}
-inline void ReadBackoff(util::FilePiece &in, RestWeights &weights) {
-  ReadBackoff(in, weights.backoff);
-}
-
-void ReadEnd(util::FilePiece &in);
-
-extern const bool kARPASpaces[256];
-
-// Positive log probability warning.
-class PositiveProbWarn {
-  public:
-    PositiveProbWarn() : action_(THROW_UP) {}
-
-    explicit PositiveProbWarn(WarningAction action) : action_(action) {}
-
-    void Warn(float prob);
-
-  private:
-    WarningAction action_;
-};
-
-template <class Voc, class Weights> void Read1Gram(util::FilePiece &f, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
-  try {
-    float prob = f.ReadFloat();
-    if (prob > 0.0) {
-      warn.Warn(prob);
-      prob = 0.0;
-    }
-    UTIL_THROW_IF(f.get() != '\t', FormatLoadException, "Expected tab after probability");
-    WordIndex word = vocab.Insert(f.ReadDelimited(kARPASpaces));
-    Weights &w = unigrams[word];
-    w.prob = prob;
-    ReadBackoff(f, w);
-  } catch(util::Exception &e) {
-    e << " in the 1-gram at byte " << f.Offset();
-    throw;
-  }
-}
-
-template <class Voc, class Weights> void Read1Grams(util::FilePiece &f, std::size_t count, Voc &vocab, Weights *unigrams, PositiveProbWarn &warn) {
-  ReadNGramHeader(f, 1);
-  for (std::size_t i = 0; i < count; ++i) {
-    Read1Gram(f, vocab, unigrams, warn);
-  }
-  vocab.FinishedLoading(unigrams);
-}
-
-// Read ngram, write vocab ids to indices_out.
-template <class Voc, class Weights, class Iterator> void ReadNGram(util::FilePiece &f, const unsigned char n, const Voc &vocab, Iterator indices_out, Weights &weights, PositiveProbWarn &warn) {
-  try {
-    weights.prob = f.ReadFloat();
-    if (weights.prob > 0.0) {
-      warn.Warn(weights.prob);
-      weights.prob = 0.0;
-    }
-    for (unsigned char i = 0; i < n; ++i, ++indices_out) {
-      StringPiece word(f.ReadDelimited(kARPASpaces));
-      WordIndex index = vocab.Index(word);
-      *indices_out = index;
-      // Check for words mapped to <unk> that are not the string <unk>.
-      UTIL_THROW_IF(index == 0 /* mapped to <unk> */ && (word != StringPiece("<unk>", 5)) && (word != StringPiece("<UNK>", 5)),
-          FormatLoadException, "Word " << word << " was not seen in the unigrams (which are supposed to list the entire vocabulary) but appears");
-    }
-    ReadBackoff(f, weights);
-  } catch(util::Exception &e) {
-    e << " in the " << static_cast<unsigned int>(n) << "-gram at byte " << f.Offset();
-    throw;
-  }
-}
-
-} // namespace lm
-
-#endif // LM_READ_ARPA_H
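
Note: Read1Gram() above parses one ARPA unigram line: a log10 probability, a
tab, the word, then an optional tab-separated backoff (ReadBackoff treats a bare
newline as "no backoff"). Illustrative input, with tabs between fields:

    \1-grams:
    -1.0	foo	-0.30103
    -3.14	bar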

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/return.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/return.hh b/ext/kenlm/lm/return.hh
deleted file mode 100644
index ee1f25e..0000000
--- a/ext/kenlm/lm/return.hh
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef LM_RETURN_H
-#define LM_RETURN_H
-
-#include <stdint.h>
-
-namespace lm {
-/* Structure returned by scoring routines. */
-struct FullScoreReturn {
-  // log10 probability
-  float prob;
-
-  /* The length of n-gram matched.  Do not use this for recombination.
-   * Consider a model containing only the following n-grams:
-   * -1 foo
-   * -3.14  bar
-   * -2.718 baz -5
-   * -6 foo bar
-   *
-   * If you score ``bar'' then ngram_length is 1 and recombination state is the
-   * empty string because bar has zero backoff and does not extend to the
-   * right.
-   * If you score ``foo'' then ngram_length is 1 and recombination state is
-   * ``foo''.
-   *
-   * Ideally, keep output states around and compare them.  Failing that,
-   * get out_state.ValidLength() and use that length for recombination.
-   */
-  unsigned char ngram_length;
-
-  /* Left extension information.  If independent_left is set, then prob is
-   * independent of words to the left (up to additional backoff).  Otherwise,
-   * extend_left indicates how to efficiently extend further to the left.
-   */
-  bool independent_left;
-  uint64_t extend_left; // Defined only if independent_left
-
-  // Rest cost for extension to the left.
-  float rest;
-};
-
-} // namespace lm
-#endif // LM_RETURN_H
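
Note: a hedged sketch of consuming FullScoreReturn (Model stands for any
lm::ngram model type from lm/model.hh; the helper itself is illustrative):

    #include "lm/model.hh"

    // Score one word in context and surface the recombination hints.
    template <class Model> float ScoreWord(const Model &m,
        const typename Model::State &in, lm::WordIndex word,
        typename Model::State &out) {
      lm::FullScoreReturn ret(m.FullScore(in, word, out));
      // ret.prob is log10 p(word | context); ret.ngram_length is how much
      // of the context actually matched.
      return ret.prob;
    }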

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/search_hashed.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/search_hashed.cc b/ext/kenlm/lm/search_hashed.cc
deleted file mode 100644
index 7e63e00..0000000
--- a/ext/kenlm/lm/search_hashed.cc
+++ /dev/null
@@ -1,298 +0,0 @@
-#include "lm/search_hashed.hh"
-
-#include "lm/binary_format.hh"
-#include "lm/blank.hh"
-#include "lm/lm_exception.hh"
-#include "lm/model.hh"
-#include "lm/read_arpa.hh"
-#include "lm/value.hh"
-#include "lm/vocab.hh"
-
-#include "util/bit_packing.hh"
-#include "util/file_piece.hh"
-
-#include <string>
-
-namespace lm {
-namespace ngram {
-
-class ProbingModel;
-
-namespace {
-
-/* These are passed to ReadNGrams so that n-grams with zero backoff that appear as context will still be used in state. */
-template <class Middle> class ActivateLowerMiddle {
-  public:
-    explicit ActivateLowerMiddle(Middle &middle) : modify_(middle) {}
-
-    void operator()(const WordIndex *vocab_ids, const unsigned int n) {
-      uint64_t hash = static_cast<WordIndex>(vocab_ids[1]);
-      for (const WordIndex *i = vocab_ids + 2; i < vocab_ids + n; ++i) {
-        hash = detail::CombineWordHash(hash, *i);
-      }
-      typename Middle::MutableIterator i;
-      // TODO: somehow get text of n-gram for this error message.
-      if (!modify_.UnsafeMutableFind(hash, i))
-        UTIL_THROW(FormatLoadException, "The context of every " << n << "-gram should appear as a " << (n-1) << "-gram");
-      SetExtension(i->value.backoff);
-    }
-
-  private:
-    Middle &modify_;
-};
-
-template <class Weights> class ActivateUnigram {
-  public:
-    explicit ActivateUnigram(Weights *unigram) : modify_(unigram) {}
-
-    void operator()(const WordIndex *vocab_ids, const unsigned int /*n*/) {
-      // assert(n == 2);
-      SetExtension(modify_[vocab_ids[1]].backoff);
-    }
-
-  private:
-    Weights *modify_;
-};
-
-// Find the lower order entry, inserting blanks along the way as necessary.
-template <class Value> void FindLower(
-    const std::vector<uint64_t> &keys,
-    typename Value::Weights &unigram,
-    std::vector<util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> > &middle,
-    std::vector<typename Value::Weights *> &between) {
-  typename util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash>::MutableIterator iter;
-  typename Value::ProbingEntry entry;
-  // Backoff will always be 0.0.  We'll get the probability and rest in another pass.
-  entry.value.backoff = kNoExtensionBackoff;
-  // Go back and find the longest right-aligned entry, informing it that it extends left.  Normally this will match immediately, but sometimes SRI is dumb.
-  for (int lower = keys.size() - 2; ; --lower) {
-    if (lower == -1) {
-      between.push_back(&unigram);
-      return;
-    }
-    entry.key = keys[lower];
-    bool found = middle[lower].FindOrInsert(entry, iter);
-    between.push_back(&iter->value);
-    if (found) return;
-  }
-}
-
-// Between usually has a single entry, the value to adjust.  But sometimes SRI stupidly pruned entries so it has uninitialized blank values to be set here.
-template <class Added, class Build> void AdjustLower(
-    const Added &added,
-    const Build &build,
-    std::vector<typename Build::Value::Weights *> &between,
-    const unsigned int n,
-    const std::vector<WordIndex> &vocab_ids,
-    typename Build::Value::Weights *unigrams,
-    std::vector<util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash> > &middle) {
-  typedef typename Build::Value Value;
-  if (between.size() == 1) {
-    build.MarkExtends(*between.front(), added);
-    return;
-  }
-  typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
-  float prob = -fabs(between.back()->prob);
-  // Order of the n-gram on which probabilities are based.
-  unsigned char basis = n - between.size();
-  assert(basis != 0);
-  typename Build::Value::Weights **change = &between.back();
-  // Skip the basis.
-  --change;
-  if (basis == 1) {
-    // Hallucinate a bigram based on a unigram's backoff and a unigram probability.
-    float &backoff = unigrams[vocab_ids[1]].backoff;
-    SetExtension(backoff);
-    prob += backoff;
-    (*change)->prob = prob;
-    build.SetRest(&*vocab_ids.begin(), 2, **change);
-    basis = 2;
-    --change;
-  }
-  uint64_t backoff_hash = static_cast<uint64_t>(vocab_ids[1]);
-  for (unsigned char i = 2; i <= basis; ++i) {
-    backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[i]);
-  }
-  for (; basis < n - 1; ++basis, --change) {
-    typename Middle::MutableIterator gotit;
-    if (middle[basis - 2].UnsafeMutableFind(backoff_hash, gotit)) {
-      float &backoff = gotit->value.backoff;
-      SetExtension(backoff);
-      prob += backoff;
-    }
-    (*change)->prob = prob;
-    build.SetRest(&*vocab_ids.begin(), basis + 1, **change);
-    backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[basis+1]);
-  }
-
-  typename std::vector<typename Value::Weights *>::const_iterator i(between.begin());
-  build.MarkExtends(**i, added);
-  const typename Value::Weights *longer = *i;
-  // Everything has probability but is not marked as extending.
-  for (++i; i != between.end(); ++i) {
-    build.MarkExtends(**i, *longer);
-    longer = *i;
-  }
-}
-
-// Continue marking lower entries even when they know that they extend left.  This is used for upper/lower bounds.
-template <class Build> void MarkLower(
-    const std::vector<uint64_t> &keys,
-    const Build &build,
-    typename Build::Value::Weights &unigram,
-    std::vector<util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash> > &middle,
-    int start_order,
-    const typename Build::Value::Weights &longer) {
-  if (start_order == 0) return;
-  // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
-  for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) {
-    if (even_lower == -1) {
-      build.MarkExtends(unigram, longer);
-      return;
-    }
-    if (!build.MarkExtends(
-          middle[even_lower].UnsafeMutableMustFind(keys[even_lower])->value,
-          longer)) return;
-  }
-}
-
-template <class Build, class Activate, class Store> void ReadNGrams(
-    util::FilePiece &f,
-    const unsigned int n,
-    const size_t count,
-    const ProbingVocabulary &vocab,
-    const Build &build,
-    typename Build::Value::Weights *unigrams,
-    std::vector<util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash> > &middle,
-    Activate activate,
-    Store &store,
-    PositiveProbWarn &warn) {
-  typedef typename Build::Value Value;
-  assert(n >= 2);
-  ReadNGramHeader(f, n);
-
-  // Both vocab_ids and keys are non-empty because n >= 2.
-  // vocab ids of words in reverse order.
-  std::vector<WordIndex> vocab_ids(n);
-  std::vector<uint64_t> keys(n-1);
-  typename Store::Entry entry;
-  std::vector<typename Value::Weights *> between;
-  for (size_t i = 0; i < count; ++i) {
-    ReadNGram(f, n, vocab, vocab_ids.rbegin(), entry.value, warn);
-    build.SetRest(&*vocab_ids.begin(), n, entry.value);
-
-    keys[0] = detail::CombineWordHash(static_cast<uint64_t>(vocab_ids.front()), vocab_ids[1]);
-    for (unsigned int h = 1; h < n - 1; ++h) {
-      keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
-    }
-    // Initially the sign bit is on, indicating it does not extend left.  Most already have this, but there might be +0.0.
-    util::SetSign(entry.value.prob);
-    entry.key = keys[n-2];
-
-    store.Insert(entry);
-    between.clear();
-    FindLower<Value>(keys, unigrams[vocab_ids.front()], middle, between);
-    AdjustLower<typename Store::Entry::Value, Build>(entry.value, build, between, n, vocab_ids, unigrams, middle);
-    if (Build::kMarkEvenLower) MarkLower<Build>(keys, build, unigrams[vocab_ids.front()], middle, n - between.size() - 1, *between.back());
-    activate(&*vocab_ids.begin(), n);
-  }
-
-  store.FinishedInserting();
-}
-
-} // namespace
-namespace detail {
-
-template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
-  unigram_ = Unigram(start, counts[0]);
-  start += Unigram::Size(counts[0]);
-  std::size_t allocated;
-  middle_.clear();
-  for (unsigned int n = 2; n < counts.size(); ++n) {
-    allocated = Middle::Size(counts[n - 1], config.probing_multiplier);
-    middle_.push_back(Middle(start, allocated));
-    start += allocated;
-  }
-  allocated = Longest::Size(counts.back(), config.probing_multiplier);
-  longest_ = Longest(start, allocated);
-  start += allocated;
-  return start;
-}
-
-/*template <class Value> void HashedSearch<Value>::Relocate(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
-  unigram_ = Unigram(start, counts[0]);
-  start += Unigram::Size(counts[0]);
-  for (unsigned int n = 2; n < counts.size(); ++n) {
-    middle_[n-2].Relocate(start);
-    start += Middle::Size(counts[n - 1], config.probing_multiplier);
-  }
-  longest_.Relocate(start);
-}*/
-
-template <class Value> void HashedSearch<Value>::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing) {
-  void *vocab_rebase;
-  void *search_base = backing.GrowForSearch(Size(counts, config), vocab.UnkCountChangePadding(), vocab_rebase);
-  vocab.Relocate(vocab_rebase);
-  SetupMemory(reinterpret_cast<uint8_t*>(search_base), counts, config);
-
-  PositiveProbWarn warn(config.positive_log_probability);
-  Read1Grams(f, counts[0], vocab, unigram_.Raw(), warn);
-  CheckSpecials(config, vocab);
-  DispatchBuild(f, counts, config, vocab, warn);
-}
-
-template <> void HashedSearch<BackoffValue>::DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) {
-  NoRestBuild build;
-  ApplyBuild(f, counts, vocab, warn, build);
-}
-
-template <> void HashedSearch<RestValue>::DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn) {
-  switch (config.rest_function) {
-    case Config::REST_MAX:
-      {
-        MaxRestBuild build;
-        ApplyBuild(f, counts, vocab, warn, build);
-      }
-      break;
-    case Config::REST_LOWER:
-      {
-        LowerRestBuild<ProbingModel> build(config, counts.size(), vocab);
-        ApplyBuild(f, counts, vocab, warn, build);
-      }
-      break;
-  }
-}
-
-template <class Value> template <class Build> void HashedSearch<Value>::ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build) {
-  for (WordIndex i = 0; i < counts[0]; ++i) {
-    build.SetRest(&i, (unsigned int)1, unigram_.Raw()[i]);
-  }
-
-  try {
-    if (counts.size() > 2) {
-      ReadNGrams<Build, ActivateUnigram<typename Value::Weights>, Middle>(
-          f, 2, counts[1], vocab, build, unigram_.Raw(), middle_, ActivateUnigram<typename Value::Weights>(unigram_.Raw()), middle_[0], warn);
-    }
-    for (unsigned int n = 3; n < counts.size(); ++n) {
-      ReadNGrams<Build, ActivateLowerMiddle<Middle>, Middle>(
-          f, n, counts[n-1], vocab, build, unigram_.Raw(), middle_, ActivateLowerMiddle<Middle>(middle_[n-3]), middle_[n-2], warn);
-    }
-    if (counts.size() > 2) {
-      ReadNGrams<Build, ActivateLowerMiddle<Middle>, Longest>(
-          f, counts.size(), counts[counts.size() - 1], vocab, build, unigram_.Raw(), middle_, ActivateLowerMiddle<Middle>(middle_.back()), longest_, warn);
-    } else {
-      ReadNGrams<Build, ActivateUnigram<typename Value::Weights>, Longest>(
-          f, counts.size(), counts[counts.size() - 1], vocab, build, unigram_.Raw(), middle_, ActivateUnigram<typename Value::Weights>(unigram_.Raw()), longest_, warn);
-    }
-  } catch (util::ProbingSizeException &e) {
-    UTIL_THROW(util::ProbingSizeException, "Avoid pruning n-grams like \"bar baz quux\" when \"foo bar baz quux\" is still in the model.  KenLM will work when this pruning happens, but the probing model assumes these events are rare enough that using blank space in the probing hash table will cover all of them.  Increase probing_multiplier (-p to build_binary) to add more blank spaces.\n");
-  }
-  ReadEnd(f);
-}
-
-template class HashedSearch<BackoffValue>;
-template class HashedSearch<RestValue>;
-
-} // namespace detail
-} // namespace ngram
-} // namespace lm

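A note on the deleted search code above: every n-gram of order two and higher is addressed in the probing hash tables by a chained hash of its word ids, taken in KenLM's reversed storage order, built with detail::CombineWordHash and inserted by ReadNGrams. The following minimal, self-contained sketch reproduces that keying scheme; the multipliers are copied from search_hashed.hh below, but the identity-hashed probing table is replaced by a plain std::unordered_map and the vocab ids and probability are hypothetical, purely for illustration.

    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    typedef unsigned int WordIndex;

    // Same recurrence as detail::CombineWordHash in search_hashed.hh below.
    inline uint64_t CombineWordHash(uint64_t current, WordIndex next) {
      return (current * 8978948897894561157ULL) ^
             (static_cast<uint64_t>(1 + next) * 17894857484156487943ULL);
    }

    int main() {
      // Stand-in for the identity-hashed probing table: key -> log10 prob.
      std::unordered_map<uint64_t, float> table;

      // Key a trigram the way ReadNGrams does: start from the first stored
      // word id, then fold in each following id.
      std::vector<WordIndex> ids = {7, 42, 3};  // hypothetical vocab ids
      uint64_t key = static_cast<uint64_t>(ids[0]);
      for (std::size_t i = 1; i < ids.size(); ++i) key = CombineWordHash(key, ids[i]);
      table[key] = -1.5f;  // hypothetical log10 probability

      // Lookup recomputes the same chain, as LookupMiddle/LookupLongest do.
      uint64_t probe = static_cast<uint64_t>(ids[0]);
      for (std::size_t i = 1; i < ids.size(); ++i) probe = CombineWordHash(probe, ids[i]);
      std::cout << table.at(probe) << "\n";  // prints -1.5
      return 0;
    }

Because the chain is deterministic, a middle-order entry's key doubles as the extend_pointer handed back by LookupMiddle, which is why Unpack can rebuild the node from it directly.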
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/search_hashed.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/search_hashed.hh b/ext/kenlm/lm/search_hashed.hh
deleted file mode 100644
index 9dc8445..0000000
--- a/ext/kenlm/lm/search_hashed.hh
+++ /dev/null
@@ -1,192 +0,0 @@
-#ifndef LM_SEARCH_HASHED_H
-#define LM_SEARCH_HASHED_H
-
-#include "lm/model_type.hh"
-#include "lm/config.hh"
-#include "lm/read_arpa.hh"
-#include "lm/return.hh"
-#include "lm/weights.hh"
-
-#include "util/bit_packing.hh"
-#include "util/probing_hash_table.hh"
-
-#include <algorithm>
-#include <iostream>
-#include <vector>
-
-namespace util { class FilePiece; }
-
-namespace lm {
-namespace ngram {
-class BinaryFormat;
-class ProbingVocabulary;
-namespace detail {
-
-// Fold the next word id into the running context hash; the large odd
-// multipliers give a cheap but well-mixed recurrence.
-inline uint64_t CombineWordHash(uint64_t current, const WordIndex next) {
-  uint64_t ret = (current * 8978948897894561157ULL) ^ (static_cast<uint64_t>(1 + next) * 17894857484156487943ULL);
-  return ret;
-}
-
-#pragma pack(push)
-#pragma pack(4)
-struct ProbEntry {
-  uint64_t key;
-  Prob value;
-  typedef uint64_t Key;
-  typedef Prob Value;
-  uint64_t GetKey() const {
-    return key;
-  }
-};
-
-#pragma pack(pop)
-
-class LongestPointer {
-  public:
-    explicit LongestPointer(const float &to) : to_(&to) {}
-
-    LongestPointer() : to_(NULL) {}
-
-    bool Found() const {
-      return to_ != NULL;
-    }
-
-    float Prob() const {
-      return *to_;
-    }
-
-  private:
-    const float *to_;
-};
-
-template <class Value> class HashedSearch {
-  public:
-    typedef uint64_t Node;
-
-    typedef typename Value::ProbingProxy UnigramPointer;
-    typedef typename Value::ProbingProxy MiddlePointer;
-    typedef ::lm::ngram::detail::LongestPointer LongestPointer;
-
-    static const ModelType kModelType = Value::kProbingModelType;
-    static const bool kDifferentRest = Value::kDifferentRest;
-    static const unsigned int kVersion = 0;
-
-    // TODO: move probing_multiplier here with the next binary file format update.
-    static void UpdateConfigFromBinary(const BinaryFormat &, const std::vector<uint64_t> &, uint64_t, Config &) {}
-
-    static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
-      uint64_t ret = Unigram::Size(counts[0]);
-      for (unsigned char n = 1; n < counts.size() - 1; ++n) {
-        ret += Middle::Size(counts[n], config.probing_multiplier);
-      }
-      return ret + Longest::Size(counts.back(), config.probing_multiplier);
-    }
-
-    uint8_t *SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config);
-
-    void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, ProbingVocabulary &vocab, BinaryFormat &backing);
-
-    unsigned char Order() const {
-      return middle_.size() + 2;
-    }
-
-    typename Value::Weights &UnknownUnigram() { return unigram_.Unknown(); }
-
-    UnigramPointer LookupUnigram(WordIndex word, Node &next, bool &independent_left, uint64_t &extend_left) const {
-      extend_left = static_cast<uint64_t>(word);
-      next = extend_left;
-      UnigramPointer ret(unigram_.Lookup(word));
-      independent_left = ret.IndependentLeft();
-      return ret;
-    }
-
-    MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const {
-      node = extend_pointer;
-      return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value);
-    }
-
-    MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const {
-      node = CombineWordHash(node, word);
-      typename Middle::ConstIterator found;
-      if (!middle_[order_minus_2].Find(node, found)) {
-        independent_left = true;
-        return MiddlePointer();
-      }
-      extend_pointer = node;
-      MiddlePointer ret(found->value);
-      independent_left = ret.IndependentLeft();
-      return ret;
-    }
-
-    LongestPointer LookupLongest(WordIndex word, const Node &node) const {
-      // Sign bit is always on because longest n-grams do not extend left.
-      typename Longest::ConstIterator found;
-      if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer();
-      return LongestPointer(found->value.prob);
-    }
-
-    // Generate a node without necessarily checking that it actually exists.
-    // Optionally return false if it's known not to exist.
-    bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
-      assert(begin != end);
-      node = static_cast<Node>(*begin);
-      for (const WordIndex *i = begin + 1; i < end; ++i) {
-        node = CombineWordHash(node, *i);
-      }
-      return true;
-    }
-
-  private:
-    // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
-    void DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn);
-
-    template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build);
-
-    class Unigram {
-      public:
-        Unigram() {}
-
-        Unigram(void *start, uint64_t count) :
-          unigram_(static_cast<typename Value::Weights*>(start))
-#ifdef DEBUG
-         ,  count_(count)
-#endif
-      {}
-
-        static uint64_t Size(uint64_t count) {
-          return (count + 1) * sizeof(typename Value::Weights); // +1 for the hallucinated <unk>
-        }
-
-        const typename Value::Weights &Lookup(WordIndex index) const {
-#ifdef DEBUG
-          assert(index < count_);
-#endif
-          return unigram_[index];
-        }
-
-        typename Value::Weights &Unknown() { return unigram_[0]; }
-
-        // For building.
-        typename Value::Weights *Raw() { return unigram_; }
-
-      private:
-        typename Value::Weights *unigram_;
-#ifdef DEBUG
-        uint64_t count_;
-#endif
-    };
-
-    Unigram unigram_;
-
-    typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
-    std::vector<Middle> middle_;
-
-    typedef util::ProbingHashTable<ProbEntry, util::IdentityHash> Longest;
-    Longest longest_;
-};
-
-} // namespace detail
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_SEARCH_HASHED_H
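
One detail of HashedSearch worth spelling out: the probing entries carry no separate extends-left flag. Instead, the sign bit of the stored log10 probability is used — bit on means "does not extend left" (true of any real log probability, which is negative), and entries later found to extend left have it cleared — so LookupLongest can assume the bit is always on. Below is a minimal sketch of that bit trick with hypothetical stand-ins for the helpers in util/bit_packing.hh; KenLM's versions mutate the float in place rather than returning a copy.

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Hypothetical stand-ins for util::SetSign / util::UnsetSign.
    inline float SetSign(float f)   { uint32_t b; std::memcpy(&b, &f, sizeof(b)); b |=  0x80000000u; std::memcpy(&f, &b, sizeof(f)); return f; }
    inline float UnsetSign(float f) { uint32_t b; std::memcpy(&b, &f, sizeof(b)); b &= ~0x80000000u; std::memcpy(&f, &b, sizeof(f)); return f; }
    inline bool SignBitSet(float f) { uint32_t b; std::memcpy(&b, &f, sizeof(b)); return (b >> 31) != 0; }

    int main() {
      float prob = -1.5f;            // log10 probability; sign bit naturally on
      prob = SetSign(prob);          // idempotent: "does not extend left"
      std::cout << SignBitSet(prob) << "\n";   // 1 -> independent_left

      prob = UnsetSign(prob);        // a longer n-gram extends this one left
      std::cout << SignBitSet(prob) << "\n";   // 0 -> extends left
      std::cout << SetSign(prob) << "\n";      // -1.5: the magnitude is unchanged
      return 0;
    }

This is also why ReadNGrams forces the bit on at insertion time: a pruned entry stored as +0.0 has the bit off and would otherwise read as already extending left.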