Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:33:55 UTC

[07/51] [partial] incubator-joshua git commit: Converted KenLM into a submodule

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/vocab.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/vocab.cc b/ext/kenlm/lm/vocab.cc
deleted file mode 100644
index 3d83e04..0000000
--- a/ext/kenlm/lm/vocab.cc
+++ /dev/null
@@ -1,329 +0,0 @@
-#include "lm/vocab.hh"
-
-#include "lm/binary_format.hh"
-#include "lm/enumerate_vocab.hh"
-#include "lm/lm_exception.hh"
-#include "lm/config.hh"
-#include "lm/weights.hh"
-#include "util/exception.hh"
-#include "util/file_stream.hh"
-#include "util/file.hh"
-#include "util/joint_sort.hh"
-#include "util/murmur_hash.hh"
-#include "util/probing_hash_table.hh"
-
-#include <cstring>
-#include <string>
-
-namespace lm {
-namespace ngram {
-
-namespace detail {
-uint64_t HashForVocab(const char *str, std::size_t len) {
-  // This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000
-  // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit.
-  return util::MurmurHash64A(str, len, 0);
-}
-} // namespace detail
-
-namespace {
-// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok.
-const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
-// Sadly some LMs have <UNK>.
-const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
-
-// TODO: replace with FilePiece.
-void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) {
-  util::SeekOrThrow(fd, offset);
-  // Check that we're at the right place by reading <unk> which is always first.
-  char check_unk[6];
-  util::ReadOrThrow(fd, check_unk, 6);
-  UTIL_THROW_IF(
-      memcmp(check_unk, "<unk>", 6),
-      FormatLoadException,
-      "Vocabulary words are in the wrong place.  This could be because the binary file was built with stale gcc and old kenlm.  Stale gcc, including the gcc distributed with RedHat and OS X, has a bug that ignores pragma pack for template-dependent types.  New kenlm works around this, so you'll save memory but have to rebuild any binary files using the probing data structure.");
-  if (!enumerate) return;
-  enumerate->Add(0, "<unk>");
-
-  // Read all the words after unk.
-  const std::size_t kInitialRead = 16384;
-  std::string buf;
-  buf.reserve(kInitialRead + 100);
-  buf.resize(kInitialRead);
-  WordIndex index = 1; // Read <unk> already.
-  while (true) {
-    std::size_t got = util::ReadOrEOF(fd, &buf[0], kInitialRead);
-    if (got == 0) break;
-    buf.resize(got);
-    while (buf[buf.size() - 1]) {
-      char next_char;
-      util::ReadOrThrow(fd, &next_char, 1);
-      buf.push_back(next_char);
-    }
-    // Ok now we have null terminated strings.
-    for (const char *i = buf.data(); i != buf.data() + buf.size();) {
-      std::size_t length = strlen(i);
-      enumerate->Add(index++, StringPiece(i, length));
-      i += length + 1 /* null byte */;
-    }
-  }
-
-  UTIL_THROW_IF(expected_count != index, FormatLoadException, "The binary file has the wrong number of words at the end.  This could be caused by a truncated binary file.");
-}
-
-// Constructor ordering madness: members construct in declaration order, so
-// seek the fd here, before stream_ wraps it in the initializer list.
-int SeekAndReturn(int fd, uint64_t start) {
-  util::SeekOrThrow(fd, start);
-  return fd;
-}
-} // namespace
-
-ImmediateWriteWordsWrapper::ImmediateWriteWordsWrapper(EnumerateVocab *inner, int fd, uint64_t start)
-  : inner_(inner), stream_(SeekAndReturn(fd, start)) {}
-
-WriteWordsWrapper::WriteWordsWrapper(EnumerateVocab *inner) : inner_(inner) {}
-
-void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
-  if (inner_) inner_->Add(index, str);
-  buffer_.append(str.data(), str.size());
-  buffer_.push_back(0);
-}
-
-void WriteWordsWrapper::Write(int fd, uint64_t start) {
-  util::SeekOrThrow(fd, start);
-  util::WriteOrThrow(fd, buffer_.data(), buffer_.size());
-  // Free memory from the string.
-  std::string for_swap;
-  std::swap(buffer_, for_swap);
-}
-
-SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {}
-
-uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) {
-  // Lead with the number of entries.
-  return sizeof(uint64_t) + sizeof(uint64_t) * entries;
-}
-
-void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config) {
-  assert(allocated >= Size(entries, config));
-  // Leave space for number of entries.
-  begin_ = reinterpret_cast<uint64_t*>(start) + 1;
-  end_ = begin_;
-  saw_unk_ = false;
-}
-
-void SortedVocabulary::Relocate(void *new_start) {
-  std::size_t delta = end_ - begin_;
-  begin_ = reinterpret_cast<uint64_t*>(new_start) + 1;
-  end_ = begin_ + delta;
-}
-
-void SortedVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries) {
-  enumerate_ = to;
-  if (enumerate_) {
-    enumerate_->Add(0, "<unk>");
-    strings_to_enumerate_.resize(max_entries);
-  }
-}
-
-WordIndex SortedVocabulary::Insert(const StringPiece &str) {
-  uint64_t hashed = detail::HashForVocab(str);
-  if (hashed == kUnknownHash || hashed == kUnknownCapHash) {
-    saw_unk_ = true;
-    return 0;
-  }
-  *end_ = hashed;
-  if (enumerate_) {
-    void *copied = string_backing_.Allocate(str.size());
-    memcpy(copied, str.data(), str.size());
-    strings_to_enumerate_[end_ - begin_] = StringPiece(static_cast<const char*>(copied), str.size());
-  }
-  ++end_;
-  // This is 1 + the offset where it was inserted to make room for unk.
-  return end_ - begin_;
-}
-
-void SortedVocabulary::FinishedLoading(ProbBackoff *reorder) {
-  GenericFinished(reorder);
-}
-
-namespace {
-#pragma pack(push)
-#pragma pack(4)
-struct RenumberEntry {
-  uint64_t hash;
-  const char *str;
-  WordIndex old;
-  bool operator<(const RenumberEntry &other) const {
-    return hash < other.hash;
-  }
-};
-#pragma pack(pop)
-} // namespace
-
-void SortedVocabulary::ComputeRenumbering(WordIndex types, int from_words, int to_words, std::vector<WordIndex> &mapping) {
-  mapping.clear();
-  uint64_t file_size = util::SizeOrThrow(from_words);
-  util::scoped_memory strings;
-  util::MapRead(util::POPULATE_OR_READ, from_words, 0, file_size, strings);
-  const char *const start = static_cast<const char*>(strings.get());
-  UTIL_THROW_IF(memcmp(start, "<unk>", 6), FormatLoadException, "Vocab file does not begin with <unk> followed by null");
-  std::vector<RenumberEntry> entries;
-  entries.reserve(types - 1);
-  RenumberEntry entry;
-  entry.old = 1;
-  for (entry.str = start + 6 /* skip <unk>\0 */; entry.str < start + file_size; ++entry.old) {
-    StringPiece str(entry.str, strlen(entry.str));
-    entry.hash = detail::HashForVocab(str);
-    entries.push_back(entry);
-    entry.str += str.size() + 1;
-  }
-  UTIL_THROW_IF2(entries.size() != types - 1, "Wrong number of vocab ids.  Got " << (entries.size() + 1) << " expected " << types);
-  std::sort(entries.begin(), entries.end());
-  // Write out new vocab file.
-  {
-    util::FileStream out(to_words);
-    out << "<unk>" << '\0';
-    for (std::vector<RenumberEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
-      out << i->str << '\0';
-    }
-  }
-  strings.reset();
-
-  mapping.resize(types);
-  mapping[0] = 0; // <unk>
-  for (std::vector<RenumberEntry>::const_iterator i = entries.begin(); i != entries.end(); ++i) {
-    mapping[i->old] = i + 1 - entries.begin();
-  }
-}
-
-void SortedVocabulary::Populated() {
-  saw_unk_ = true;
-  SetSpecial(Index("<s>"), Index("</s>"), 0);
-  bound_ = end_ - begin_ + 1;
-  *(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_;
-}
-
-void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
-  end_ = begin_ + *(reinterpret_cast<const uint64_t*>(begin_) - 1);
-  SetSpecial(Index("<s>"), Index("</s>"), 0);
-  bound_ = end_ - begin_ + 1;
-  if (have_words) ReadWords(fd, to, bound_, offset);
-}
-
-template <class T> void SortedVocabulary::GenericFinished(T *reorder) {
-  if (enumerate_) {
-    if (!strings_to_enumerate_.empty()) {
-      util::PairedIterator<T*, StringPiece*> values(reorder + 1, &*strings_to_enumerate_.begin());
-      util::JointSort(begin_, end_, values);
-    }
-    for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) {
-      // <unk> strikes again: +1 here.
-      enumerate_->Add(i + 1, strings_to_enumerate_[i]);
-    }
-    strings_to_enumerate_.clear();
-    string_backing_.FreeAll();
-  } else {
-    util::JointSort(begin_, end_, reorder + 1);
-  }
-  SetSpecial(Index("<s>"), Index("</s>"), 0);
-  // Save size.  Excludes UNK.
-  *(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_;
-  // Includes UNK.
-  bound_ = end_ - begin_ + 1;
-}
-
-namespace {
-const unsigned int kProbingVocabularyVersion = 0;
-} // namespace
-
-namespace detail {
-struct ProbingVocabularyHeader {
-  unsigned int version;
-  // Lowest unused vocab id.  This is also the number of words, including <unk>.
-  WordIndex bound;
-};
-} // namespace detail
-
-ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {}
-
-uint64_t ProbingVocabulary::Size(uint64_t entries, float probing_multiplier) {
-  return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, probing_multiplier);
-}
-
-uint64_t ProbingVocabulary::Size(uint64_t entries, const Config &config) {
-  return Size(entries, config.probing_multiplier);
-}
-
-void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated) {
-  header_ = static_cast<detail::ProbingVocabularyHeader*>(start);
-  lookup_ = Lookup(static_cast<uint8_t*>(start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)), allocated);
-  bound_ = 1;
-  saw_unk_ = false;
-}
-
-void ProbingVocabulary::Relocate(void *new_start) {
-  header_ = static_cast<detail::ProbingVocabularyHeader*>(new_start);
-  lookup_.Relocate(static_cast<uint8_t*>(new_start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)));
-}
-
-void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max_entries*/) {
-  enumerate_ = to;
-  if (enumerate_) {
-    enumerate_->Add(0, "<unk>");
-  }
-}
-
-WordIndex ProbingVocabulary::Insert(const StringPiece &str) {
-  uint64_t hashed = detail::HashForVocab(str);
-  // Prevent unknown from going into the table.
-  if (hashed == kUnknownHash || hashed == kUnknownCapHash) {
-    saw_unk_ = true;
-    return 0;
-  } else {
-    if (enumerate_) enumerate_->Add(bound_, str);
-    lookup_.Insert(ProbingVocabularyEntry::Make(hashed, bound_));
-    return bound_++;
-  }
-}
-
-void ProbingVocabulary::InternalFinishedLoading() {
-  lookup_.FinishedInserting();
-  header_->bound = bound_;
-  header_->version = kProbingVocabularyVersion;
-  SetSpecial(Index("<s>"), Index("</s>"), 0);
-}
-
-void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset) {
-  UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ".  Please rerun build_binary using the same version of the code.");
-  bound_ = header_->bound;
-  SetSpecial(Index("<s>"), Index("</s>"), 0);
-  if (have_words) ReadWords(fd, to, bound_, offset);
-}
-
-void MissingUnknown(const Config &config) throw(SpecialWordMissingException) {
-  switch(config.unknown_missing) {
-    case SILENT:
-      return;
-    case COMPLAIN:
-      if (config.messages) *config.messages << "The ARPA file is missing <unk>.  Substituting log10 probability " << config.unknown_missing_logprob << "." << std::endl;
-      break;
-    case THROW_UP:
-      UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing <unk> and the model is configured to throw an exception.");
-  }
-}
-
-void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException) {
-  switch (config.sentence_marker_missing) {
-    case SILENT:
-      return;
-    case COMPLAIN:
-      if (config.messages) *config.messages << "Missing special word " << str << "; will treat it as <unk>.";
-      break;
-    case THROW_UP:
-      UTIL_THROW(SpecialWordMissingException, "The ARPA file is missing " << str << " and the model is configured to reject these models.  Run build_binary -s to disable this check.");
-  }
-}
-
-} // namespace ngram
-} // namespace lm
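
A note on the removed vocab.cc above: it stores the vocabulary on disk as NUL-delimited strings, and ReadWords recovers the words with a strlen walk over the buffer. A minimal sketch of that parsing loop, using only the standard library (SplitNulDelimited is an illustrative name, not KenLM API):

    #include <cstring>
    #include <string>
    #include <vector>

    // Split a buffer of NUL-delimited words, as ReadWords does once it has
    // extended the buffer to a '\0' boundary.  Assumes data[size - 1] == '\0'.
    std::vector<std::string> SplitNulDelimited(const char *data, std::size_t size) {
      std::vector<std::string> words;
      for (const char *i = data; i != data + size;) {
        std::size_t length = std::strlen(i);
        words.push_back(std::string(i, length));
        i += length + 1; // step over the terminating NUL byte
      }
      return words;
    }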

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/vocab.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/vocab.hh b/ext/kenlm/lm/vocab.hh
deleted file mode 100644
index 59740e8..0000000
--- a/ext/kenlm/lm/vocab.hh
+++ /dev/null
@@ -1,279 +0,0 @@
-#ifndef LM_VOCAB_H
-#define LM_VOCAB_H
-
-#include "lm/enumerate_vocab.hh"
-#include "lm/lm_exception.hh"
-#include "lm/virtual_interface.hh"
-#include "util/file_stream.hh"
-#include "util/murmur_hash.hh"
-#include "util/pool.hh"
-#include "util/probing_hash_table.hh"
-#include "util/sorted_uniform.hh"
-#include "util/string_piece.hh"
-
-#include <limits>
-#include <string>
-#include <vector>
-
-namespace lm {
-struct ProbBackoff;
-class EnumerateVocab;
-
-namespace ngram {
-struct Config;
-
-namespace detail {
-uint64_t HashForVocab(const char *str, std::size_t len);
-inline uint64_t HashForVocab(const StringPiece &str) {
-  return HashForVocab(str.data(), str.length());
-}
-struct ProbingVocabularyHeader;
-} // namespace detail
-
-// Writes words immediately to a file instead of buffering, because we know
-// where in the file to put them.
-class ImmediateWriteWordsWrapper : public EnumerateVocab {
-  public:
-    ImmediateWriteWordsWrapper(EnumerateVocab *inner, int fd, uint64_t start);
-
-    void Add(WordIndex index, const StringPiece &str) {
-      stream_ << str << '\0';
-      if (inner_) inner_->Add(index, str);
-    }
-
-  private:
-    EnumerateVocab *inner_;
-
-    util::FileStream stream_;
-};
-
-// When the binary size isn't known yet.
-class WriteWordsWrapper : public EnumerateVocab {
-  public:
-    WriteWordsWrapper(EnumerateVocab *inner);
-
-    void Add(WordIndex index, const StringPiece &str);
-
-    const std::string &Buffer() const { return buffer_; }
-    void Write(int fd, uint64_t start);
-
-  private:
-    EnumerateVocab *inner_;
-
-    std::string buffer_;
-};
-
-// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices.
-class SortedVocabulary : public base::Vocabulary {
-  public:
-    SortedVocabulary();
-
-    WordIndex Index(const StringPiece &str) const {
-      const uint64_t *found;
-      if (util::BoundedSortedUniformFind<const uint64_t*, util::IdentityAccessor<uint64_t>, util::Pivot64>(
-            util::IdentityAccessor<uint64_t>(),
-            begin_ - 1, 0,
-            end_, std::numeric_limits<uint64_t>::max(),
-            detail::HashForVocab(str), found)) {
-        return found - begin_ + 1; // +1 because <unk> is 0 and does not appear in the lookup table.
-      } else {
-        return 0;
-      }
-    }
-
-    // Size for purposes of file writing
-    static uint64_t Size(uint64_t entries, const Config &config);
-
-    /* Read null-delimited words from file from_words, renumber according to
-     * hash order, write null-delimited words to to_words, and create a mapping
-     * from old id to new id.  The 0th vocab word must be <unk>.
-     */
-    static void ComputeRenumbering(WordIndex types, int from_words, int to_words, std::vector<WordIndex> &mapping);
-
-    // Vocab words are [0, Bound()).  Only valid after FinishedLoading/LoadedBinary.
-    WordIndex Bound() const { return bound_; }
-
-    // Everything else is for populating.  I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
-    void SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config);
-
-    void Relocate(void *new_start);
-
-    void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
-
-    // Insert and FinishedLoading go together.
-    WordIndex Insert(const StringPiece &str);
-    // Reorders reorder_vocab so that the IDs are sorted.
-    void FinishedLoading(ProbBackoff *reorder_vocab);
-
-    // Trie stores the correct counts including <unk> in the header.  If this was previously sized based on a count excluding <unk>, padding with 8 bytes will make it the correct size based on a count including <unk>.
-    std::size_t UnkCountChangePadding() const { return SawUnk() ? 0 : sizeof(uint64_t); }
-
-    bool SawUnk() const { return saw_unk_; }
-
-    void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
-
-    uint64_t *&EndHack() { return end_; }
-
-    void Populated();
-
-  private:
-    template <class T> void GenericFinished(T *reorder);
-
-    uint64_t *begin_, *end_;
-
-    WordIndex bound_;
-
-    bool saw_unk_;
-
-    EnumerateVocab *enumerate_;
-
-    // Actual strings.  Used only when loading from ARPA and enumerate_ != NULL
-    util::Pool string_backing_;
-
-    std::vector<StringPiece> strings_to_enumerate_;
-};
-
-#pragma pack(push)
-#pragma pack(4)
-struct ProbingVocabularyEntry {
-  uint64_t key;
-  WordIndex value;
-
-  typedef uint64_t Key;
-  uint64_t GetKey() const { return key; }
-  void SetKey(uint64_t to) { key = to; }
-
-  static ProbingVocabularyEntry Make(uint64_t key, WordIndex value) {
-    ProbingVocabularyEntry ret;
-    ret.key = key;
-    ret.value = value;
-    return ret;
-  }
-};
-#pragma pack(pop)
-
-// Vocabulary storing a map from uint64_t to WordIndex.
-class ProbingVocabulary : public base::Vocabulary {
-  public:
-    ProbingVocabulary();
-
-    WordIndex Index(const StringPiece &str) const {
-      Lookup::ConstIterator i;
-      return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0;
-    }
-
-    static uint64_t Size(uint64_t entries, float probing_multiplier);
-    // This just unwraps Config to get the probing_multiplier.
-    static uint64_t Size(uint64_t entries, const Config &config);
-
-    // Vocab words are [0, Bound()).
-    WordIndex Bound() const { return bound_; }
-
-    // Everything else is for populating.  I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
-    void SetupMemory(void *start, std::size_t allocated);
-    void SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) {
-      SetupMemory(start, allocated);
-    }
-
-    void Relocate(void *new_start);
-
-    void ConfigureEnumerate(EnumerateVocab *to, std::size_t max_entries);
-
-    WordIndex Insert(const StringPiece &str);
-
-    template <class Weights> void FinishedLoading(Weights * /*reorder_vocab*/) {
-      InternalFinishedLoading();
-    }
-
-    std::size_t UnkCountChangePadding() const { return 0; }
-
-    bool SawUnk() const { return saw_unk_; }
-
-    void LoadedBinary(bool have_words, int fd, EnumerateVocab *to, uint64_t offset);
-
-  private:
-    void InternalFinishedLoading();
-
-    typedef util::ProbingHashTable<ProbingVocabularyEntry, util::IdentityHash> Lookup;
-
-    Lookup lookup_;
-
-    WordIndex bound_;
-
-    bool saw_unk_;
-
-    EnumerateVocab *enumerate_;
-
-    detail::ProbingVocabularyHeader *header_;
-};
-
-void MissingUnknown(const Config &config) throw(SpecialWordMissingException);
-void MissingSentenceMarker(const Config &config, const char *str) throw(SpecialWordMissingException);
-
-template <class Vocab> void CheckSpecials(const Config &config, const Vocab &vocab) throw(SpecialWordMissingException) {
-  if (!vocab.SawUnk()) MissingUnknown(config);
-  if (vocab.BeginSentence() == vocab.NotFound()) MissingSentenceMarker(config, "<s>");
-  if (vocab.EndSentence() == vocab.NotFound()) MissingSentenceMarker(config, "</s>");
-}
-
-class WriteUniqueWords {
-  public:
-    explicit WriteUniqueWords(int fd) : word_list_(fd) {}
-
-    void operator()(const StringPiece &word) {
-      word_list_ << word << '\0';
-    }
-
-  private:
-    util::FileStream word_list_;
-};
-
-class NoOpUniqueWords {
-  public:
-    NoOpUniqueWords() {}
-    void operator()(const StringPiece &word) {}
-};
-
-template <class NewWordAction = NoOpUniqueWords> class GrowableVocab {
-  public:
-    static std::size_t MemUsage(WordIndex content) {
-      return Lookup::MemUsage(content > 2 ? content : 2);
-    }
-
-    // Does not take ownership of new_word_construct.
-    template <class NewWordConstruct> GrowableVocab(WordIndex initial_size, const NewWordConstruct &new_word_construct = NewWordAction())
-      : lookup_(initial_size), new_word_(new_word_construct) {
-      FindOrInsert("<unk>"); // Force 0
-      FindOrInsert("<s>"); // Force 1
-      FindOrInsert("</s>"); // Force 2
-    }
-
-    WordIndex Index(const StringPiece &str) const {
-      Lookup::ConstIterator i;
-      return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0;
-    }
-
-    WordIndex FindOrInsert(const StringPiece &word) {
-      ProbingVocabularyEntry entry = ProbingVocabularyEntry::Make(util::MurmurHashNative(word.data(), word.size()), Size());
-      Lookup::MutableIterator it;
-      if (!lookup_.FindOrInsert(entry, it)) {
-        new_word_(word);
-        UTIL_THROW_IF(Size() >= std::numeric_limits<lm::WordIndex>::max(), VocabLoadException, "Too many vocabulary words.  Change WordIndex to uint64_t in lm/word_index.hh");
-      }
-      return it->value;
-    }
-
-    WordIndex Size() const { return lookup_.Size(); }
-
-  private:
-    typedef util::AutoProbing<ProbingVocabularyEntry, util::IdentityHash> Lookup;
-
-    Lookup lookup_;
-
-    NewWordAction new_word_;
-};
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_VOCAB_H
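
For context on the removed vocab.hh above: SortedVocabulary::Index hashes the query word and searches a sorted array of 64-bit hashes, returning the offset plus one because <unk> holds id 0 and never enters the table. A minimal sketch of the same lookup, with std::lower_bound standing in for util::BoundedSortedUniformFind (which does the same job with interpolation search); Lookup is an illustrative name:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Returns the word id for a hashed word, or 0 (<unk>) on a miss.
    unsigned Lookup(const std::vector<uint64_t> &sorted_hashes, uint64_t hashed) {
      std::vector<uint64_t>::const_iterator it =
          std::lower_bound(sorted_hashes.begin(), sorted_hashes.end(), hashed);
      if (it == sorted_hashes.end() || *it != hashed) return 0;
      return static_cast<unsigned>(it - sorted_hashes.begin()) + 1;
    }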

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/weights.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/weights.hh b/ext/kenlm/lm/weights.hh
deleted file mode 100644
index f143127..0000000
--- a/ext/kenlm/lm/weights.hh
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef LM_WEIGHTS_H
-#define LM_WEIGHTS_H
-
-// Weights for n-grams.  Probability and possibly a backoff.
-
-namespace lm {
-struct Prob {
-  float prob;
-};
-// No inheritance so this will be a POD.
-struct ProbBackoff {
-  float prob;
-  float backoff;
-};
-struct RestWeights {
-  float prob;
-  float backoff;
-  float rest;
-};
-
-} // namespace lm
-#endif // LM_WEIGHTS_H

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/word_index.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/word_index.hh b/ext/kenlm/lm/word_index.hh
deleted file mode 100644
index 59b24d7..0000000
--- a/ext/kenlm/lm/word_index.hh
+++ /dev/null
@@ -1,15 +0,0 @@
-// Separate header because this is used often.
-#ifndef LM_WORD_INDEX_H
-#define LM_WORD_INDEX_H
-
-#include <climits>
-
-namespace lm {
-typedef unsigned int WordIndex;
-const WordIndex kMaxWordIndex = UINT_MAX;
-const WordIndex kUNK = 0;
-} // namespace lm
-
-typedef lm::WordIndex LMWordIndex;
-
-#endif

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/wrappers/README
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/wrappers/README b/ext/kenlm/lm/wrappers/README
deleted file mode 100644
index 56c34c2..0000000
--- a/ext/kenlm/lm/wrappers/README
+++ /dev/null
@@ -1,3 +0,0 @@
-This directory is for wrappers around other people's LMs, presenting an interface similar to KenLM's.  You will need to have their LM installed.
-
-NPLM is a work in progress.  

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/wrappers/nplm.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/wrappers/nplm.cc b/ext/kenlm/lm/wrappers/nplm.cc
deleted file mode 100644
index 9bd7c1e..0000000
--- a/ext/kenlm/lm/wrappers/nplm.cc
+++ /dev/null
@@ -1,116 +0,0 @@
-#include "lm/wrappers/nplm.hh"
-#include "util/exception.hh"
-#include "util/file.hh"
-
-#include <algorithm>
-#include <cstring>
-
-#include "neuralLM.h"
-
-namespace lm {
-namespace np {
-
-Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
-  : base::Vocabulary(vocab.lookup_word("<s>"), vocab.lookup_word("</s>"), vocab.lookup_word("<unk>")),
-    vocab_(vocab), null_word_(vocab.lookup_word("<null>")) {}
-
-Vocabulary::~Vocabulary() {}
-
-WordIndex Vocabulary::Index(const std::string &str) const {
-  return vocab_.lookup_word(str);
-}
-
-class Backend {
-  public:
-    Backend(const nplm::neuralLM &from, const std::size_t cache_size) : lm_(from), ngram_(from.get_order()) {
-      lm_.set_cache(cache_size);
-    }
-
-    nplm::neuralLM &LM() { return lm_; }
-    const nplm::neuralLM &LM() const { return lm_; }
-
-    Eigen::Matrix<int,Eigen::Dynamic,1> &staging_ngram() { return ngram_; }
-
-    double lookup_from_staging() { return lm_.lookup_ngram(ngram_); }
-
-    int order() const { return lm_.get_order(); }
-
-  private:
-    nplm::neuralLM lm_;
-    Eigen::Matrix<int,Eigen::Dynamic,1> ngram_;
-};
-
-bool Model::Recognize(const std::string &name) {
-  try {
-    util::scoped_fd file(util::OpenReadOrThrow(name.c_str()));
-    char magic_check[16];
-    util::ReadOrThrow(file.get(), magic_check, sizeof(magic_check));
-    const char nnlm_magic[] = "\\config\nversion ";
-    return !memcmp(magic_check, nnlm_magic, 16);
-  } catch (const util::Exception &) {
-    return false;
-  }
-}
-
-namespace {
-nplm::neuralLM *LoadNPLM(const std::string &file) {
-  util::scoped_ptr<nplm::neuralLM> ret(new nplm::neuralLM());
-  ret->read(file);
-  return ret.release();
-}
-} // namespace
-
-Model::Model(const std::string &file, std::size_t cache)
-  : base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
-  UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ".  Change the definition of NPLM_MAX_ORDER and recompile.");
-  // log10 compatible with backoff models.
-  base_instance_->set_log_base(10.0);
-  State begin_sentence, null_context;
-  std::fill(begin_sentence.words, begin_sentence.words + NPLM_MAX_ORDER - 1, base_instance_->lookup_word("<s>"));
-  null_word_ = base_instance_->lookup_word("<null>");
-  std::fill(null_context.words, null_context.words + NPLM_MAX_ORDER - 1, null_word_);
-
-  Init(begin_sentence, null_context, vocab_, base_instance_->get_order());
-}
-
-Model::~Model() {}
-
-FullScoreReturn Model::FullScore(const State &from, const WordIndex new_word, State &out_state) const {
-  Backend *backend = backend_.get();
-  if (!backend) {
-    backend = new Backend(*base_instance_, cache_size_);
-    backend_.reset(backend);
-  }
-  // State is in natural word order.
-  FullScoreReturn ret;
-  for (int i = 0; i < backend->order() - 1; ++i) {
-    backend->staging_ngram()(i) = from.words[i];
-  }
-  backend->staging_ngram()(backend->order() - 1) = new_word;
-  ret.prob = backend->lookup_from_staging();
-  // Always say full order.
-  ret.ngram_length = backend->order();
-  // Shift everything down by one.
-  memcpy(out_state.words, from.words + 1, sizeof(WordIndex) * (backend->order() - 2));
-  out_state.words[backend->order() - 2] = new_word;
-  // Fill in trailing words with zeros so state comparison works.
-  memset(out_state.words + backend->order() - 1, 0, sizeof(WordIndex) * (NPLM_MAX_ORDER - backend->order()));
-  return ret;
-}
-
-// TODO: optimize with direct call?
-FullScoreReturn Model::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const {
-  // State is in natural word order.  The API here specifies reverse order.
-  std::size_t state_length = std::min<std::size_t>(Order() - 1, context_rend - context_rbegin);
-  State state;
-  // Pad with null words.
-  for (lm::WordIndex *i = state.words; i < state.words + Order() - 1 - state_length; ++i) {
-    *i = null_word_;
-  }
-  // Put new words at the end.
-  std::reverse_copy(context_rbegin, context_rbegin + state_length, state.words + Order() - 1 - state_length);
-  return FullScore(state, new_word, out_state);
-}
-
-} // namespace np
-} // namespace lm
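
For context on the removed nplm.cc above: Model::FullScore keeps the context as a fixed-width window in natural word order; it shifts the window down by one, appends the new word, and zero-fills the unused tail so byte-wise state comparison stays valid. A minimal sketch of that update under illustrative names (ShiftState is not KenLM API; order is the model's order, max_order the compile-time cap, NPLM_MAX_ORDER in the wrapper):

    #include <cstring>

    // Drop the oldest context word, append new_word, zero the unused slots.
    // out must have room for max_order - 1 words.
    void ShiftState(const unsigned *from, unsigned new_word, unsigned *out,
                    int order, int max_order) {
      std::memcpy(out, from + 1, sizeof(unsigned) * (order - 2));
      out[order - 2] = new_word;
      std::memset(out + order - 1, 0, sizeof(unsigned) * (max_order - order));
    }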

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/wrappers/nplm.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/wrappers/nplm.hh b/ext/kenlm/lm/wrappers/nplm.hh
deleted file mode 100644
index 82b38fd..0000000
--- a/ext/kenlm/lm/wrappers/nplm.hh
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef LM_WRAPPERS_NPLM_H
-#define LM_WRAPPERS_NPLM_H
-
-#include "lm/facade.hh"
-#include "lm/max_order.hh"
-#include "util/string_piece.hh"
-
-#include <boost/thread/tss.hpp>
-#include <boost/scoped_ptr.hpp>
-
-/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
- * and Victoria Fossum."
- * http://nlg.isi.edu/software/nplm/
- */
-
-namespace nplm {
-class vocabulary;
-class neuralLM;
-} // namespace nplm
-
-namespace lm {
-namespace np {
-
-class Vocabulary : public base::Vocabulary {
-  public:
-    Vocabulary(const nplm::vocabulary &vocab);
-
-    ~Vocabulary();
-
-    WordIndex Index(const std::string &str) const;
-
-    // TODO: lobby them to support StringPiece
-    WordIndex Index(const StringPiece &str) const {
-      return Index(std::string(str.data(), str.size()));
-    }
-
-    lm::WordIndex NullWord() const { return null_word_; }
-
-  private:
-    const nplm::vocabulary &vocab_;
-
-    const lm::WordIndex null_word_;
-};
-
-// Sorry for imposing my limitations on your code.
-#define NPLM_MAX_ORDER 7
-
-struct State {
-  WordIndex words[NPLM_MAX_ORDER - 1];
-};
-
-class Backend;
-
-class Model : public lm::base::ModelFacade<Model, State, Vocabulary> {
-  private:
-    typedef lm::base::ModelFacade<Model, State, Vocabulary> P;
-
-  public:
-    // Does this look like an NPLM?
-    static bool Recognize(const std::string &file);
-
-    explicit Model(const std::string &file, std::size_t cache_size = 1 << 20);
-
-    ~Model();
-
-    FullScoreReturn FullScore(const State &from, const WordIndex new_word, State &out_state) const;
-
-    FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
-
-  private:
-    boost::scoped_ptr<nplm::neuralLM> base_instance_;
-
-    mutable boost::thread_specific_ptr<Backend> backend_;
-
-    Vocabulary vocab_;
-
-    lm::WordIndex null_word_;
-
-    const std::size_t cache_size_;
-};
-
-} // namespace np
-} // namespace lm
-
-#endif // LM_WRAPPERS_NPLM_H
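
For context on the removed nplm.hh above: the wrapper keeps one Backend per thread through boost::thread_specific_ptr, because each Backend carries its own copy of the NPLM cache; FullScore constructs it lazily on first use in each thread. A minimal sketch of that pattern (GetBackend and the empty Backend body are illustrative):

    #include <boost/thread/tss.hpp>

    struct Backend { /* per-thread copy of the LM plus its cache */ };

    static boost::thread_specific_ptr<Backend> backend_;

    // Lazily create one Backend per calling thread; thread_specific_ptr
    // deletes it when the owning thread exits.
    Backend &GetBackend() {
      Backend *b = backend_.get();
      if (!b) {
        b = new Backend();
        backend_.reset(b);
      }
      return *b;
    }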

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/python/_kenlm.pxd
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/python/_kenlm.pxd b/ext/kenlm/python/_kenlm.pxd
deleted file mode 100644
index e0c0248..0000000
--- a/ext/kenlm/python/_kenlm.pxd
+++ /dev/null
@@ -1,33 +0,0 @@
-cdef extern from "lm/word_index.hh" namespace "lm":
-    ctypedef unsigned WordIndex
-
-cdef extern from "lm/return.hh" namespace "lm":
-    cdef struct FullScoreReturn:
-        float prob
-        unsigned char ngram_length
-
-cdef extern from "lm/state.hh" namespace "lm::ngram":
-    cdef cppclass State :
-        int Compare(const State &other) const
-
-    int hash_value(const State &state) 
-
-cdef extern from "lm/virtual_interface.hh" namespace "lm::base":
-    cdef cppclass Vocabulary:
-        WordIndex Index(char*)
-        WordIndex BeginSentence() 
-        WordIndex EndSentence()
-        WordIndex NotFound()
-
-    ctypedef Vocabulary const_Vocabulary "const lm::base::Vocabulary"
-
-    cdef cppclass Model:
-        void BeginSentenceWrite(void *)
-        void NullContextWrite(void *)
-        unsigned int Order()
-        const_Vocabulary& BaseVocabulary()
-        float BaseScore(void *in_state, WordIndex new_word, void *out_state)
-        FullScoreReturn BaseFullScore(void *in_state, WordIndex new_word, void *out_state)
-
-cdef extern from "lm/model.hh" namespace "lm::ngram":
-    cdef Model *LoadVirtual(char *) except +

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/python/example.py
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/python/example.py b/ext/kenlm/python/example.py
deleted file mode 100644
index 8a18f3a..0000000
--- a/ext/kenlm/python/example.py
+++ /dev/null
@@ -1,28 +0,0 @@
-import os
-import kenlm
-
-LM = os.path.join(os.path.dirname(__file__), '..', 'lm', 'test.arpa')
-model = kenlm.LanguageModel(LM)
-print('{0}-gram model'.format(model.order))
-
-sentence = 'language modeling is fun .'
-print(sentence)
-print(model.score(sentence))
-
-# Check that total full score = direct score
-def score(s):
-    return sum(prob for prob, _, _ in model.full_scores(s))
-
-assert (abs(score(sentence) - model.score(sentence)) < 1e-3)
-
-# Show scores and n-gram matches
-words = ['<s>'] + sentence.split() + ['</s>']
-for i, (prob, length, oov) in enumerate(model.full_scores(sentence)):
-    print('{0} {1}: {2}'.format(prob, length, ' '.join(words[i+2-length:i+2])))
-    if oov:
-        print('\t"{0}" is an OOV'.format(words[i+1]))
-
-# Find out-of-vocabulary words
-for w in words:
-    if w not in model:
-        print('"{0}" is an OOV'.format(w))