You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:34:03 UTC
[15/51] [partial] incubator-joshua git commit: Converted KenLM into a
submodule
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/binary_format.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/binary_format.cc b/ext/kenlm/lm/binary_format.cc
deleted file mode 100644
index 802943f..0000000
--- a/ext/kenlm/lm/binary_format.cc
+++ /dev/null
@@ -1,302 +0,0 @@
-#include "lm/binary_format.hh"
-
-#include "lm/lm_exception.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-
-#include <cstddef>
-#include <cstring>
-#include <limits>
-#include <string>
-#include <cstdlib>
-
-#include <stdint.h>
-
-namespace lm {
-namespace ngram {
-
-const char *kModelNames[6] = {"probing hash tables", "probing hash tables with rest costs", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"};
-
-namespace {
-const char kMagicBeforeVersion[] = "mmap lm http://kheafield.com/code format version";
-const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n\0";
-// This must be shorter than kMagicBytes and indicates an incomplete binary file (i.e. build failed).
-const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n";
-const long int kMagicVersion = 5;
-
-// Old binary files built on 32-bit machines have this header.
-// TODO: eliminate with next binary release.
-struct OldSanity {
- char magic[sizeof(kMagicBytes)];
- float zero_f, one_f, minus_half_f;
- WordIndex one_word_index, max_word_index;
- uint64_t one_uint64;
-
- void SetToReference() {
- std::memset(this, 0, sizeof(OldSanity));
- std::memcpy(magic, kMagicBytes, sizeof(magic));
- zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
- one_word_index = 1;
- max_word_index = std::numeric_limits<WordIndex>::max();
- one_uint64 = 1;
- }
-};
-
-
-// Test values aligned to 8 bytes.
-struct Sanity {
- char magic[ALIGN8(sizeof(kMagicBytes))];
- float zero_f, one_f, minus_half_f;
- WordIndex one_word_index, max_word_index, padding_to_8;
- uint64_t one_uint64;
-
- void SetToReference() {
- std::memset(this, 0, sizeof(Sanity));
- std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes));
- zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5;
- one_word_index = 1;
- max_word_index = std::numeric_limits<WordIndex>::max();
- padding_to_8 = 0;
- one_uint64 = 1;
- }
-};
-
-std::size_t TotalHeaderSize(unsigned char order) {
- return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order);
-}
-
-void WriteHeader(void *to, const Parameters ¶ms) {
- Sanity header = Sanity();
- header.SetToReference();
- std::memcpy(to, &header, sizeof(Sanity));
- char *out = reinterpret_cast<char*>(to) + sizeof(Sanity);
-
- *reinterpret_cast<FixedWidthParameters*>(out) = params.fixed;
- out += sizeof(FixedWidthParameters);
-
- uint64_t *counts = reinterpret_cast<uint64_t*>(out);
- for (std::size_t i = 0; i < params.counts.size(); ++i) {
- counts[i] = params.counts[i];
- }
-}
-
-} // namespace
-
-bool IsBinaryFormat(int fd) {
- const uint64_t size = util::SizeFile(fd);
- if (size == util::kBadSize || (size <= static_cast<uint64_t>(sizeof(Sanity)))) return false;
- // Try reading the header.
- util::scoped_memory memory;
- try {
- util::MapRead(util::LAZY, fd, 0, sizeof(Sanity), memory);
- } catch (const util::Exception &e) {
- return false;
- }
- Sanity reference_header = Sanity();
- reference_header.SetToReference();
- if (!std::memcmp(memory.get(), &reference_header, sizeof(Sanity))) return true;
- if (!std::memcmp(memory.get(), kMagicIncomplete, strlen(kMagicIncomplete))) {
- UTIL_THROW(FormatLoadException, "This binary file did not finish building");
- }
- if (!std::memcmp(memory.get(), kMagicBeforeVersion, strlen(kMagicBeforeVersion))) {
- char *end_ptr;
- const char *begin_version = static_cast<const char*>(memory.get()) + strlen(kMagicBeforeVersion);
- long int version = std::strtol(begin_version, &end_ptr, 10);
- if ((end_ptr != begin_version) && version != kMagicVersion) {
- UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary");
- }
-
- OldSanity old_sanity = OldSanity();
- old_sanity.SetToReference();
- UTIL_THROW_IF(!std::memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable.");
- UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Try rebuilding the binary format LM using the same code revision, compiler, and architecture");
- }
- return false;
-}
-
-void ReadHeader(int fd, Parameters &out) {
- util::SeekOrThrow(fd, sizeof(Sanity));
- util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed));
- if (out.fixed.probing_multiplier < 1.0)
- UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0.");
-
- out.counts.resize(static_cast<std::size_t>(out.fixed.order));
- if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order);
-}
-
-void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters ¶ms) {
- if (params.fixed.model_type != model_type) {
- if (static_cast<unsigned int>(params.fixed.model_type) >= (sizeof(kModelNames) / sizeof(const char *)))
- UTIL_THROW(FormatLoadException, "The binary file claims to be model type " << static_cast<unsigned int>(params.fixed.model_type) << " but this is not implemented for in this inference code.");
- UTIL_THROW(FormatLoadException, "The binary file was built for " << kModelNames[params.fixed.model_type] << " but the inference code is trying to load " << kModelNames[model_type]);
- }
- UTIL_THROW_IF(search_version != params.fixed.search_version, FormatLoadException, "The binary file has " << kModelNames[params.fixed.model_type] << " version " << params.fixed.search_version << " but this code expects " << kModelNames[params.fixed.model_type] << " version " << search_version);
-}
-
-const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
-
-BinaryFormat::BinaryFormat(const Config &config)
- : write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
- header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}
-
-void BinaryFormat::InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms) {
- file_.reset(fd);
- write_mmap_ = NULL; // Ignore write requests; this is already in binary format.
- ReadHeader(fd, params);
- MatchCheck(model_type, search_version, params);
- header_size_ = TotalHeaderSize(params.counts.size());
-}
-
-void BinaryFormat::ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const {
- assert(header_size_ != kInvalidSize);
- util::ErsatzPRead(file_.get(), to, amount, offset_excluding_header + header_size_);
-}
-
-void *BinaryFormat::LoadBinary(std::size_t size) {
- assert(header_size_ != kInvalidSize);
- const uint64_t file_size = util::SizeFile(file_.get());
- // The header is smaller than a page, so we have to map the whole header as well.
- uint64_t total_map = static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(size);
- UTIL_THROW_IF(file_size != util::kBadSize && file_size < total_map, FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map);
-
- util::MapRead(load_method_, file_.get(), 0, util::CheckOverflow(total_map), mapping_);
-
- vocab_string_offset_ = total_map;
- return reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
-}
-
-void *BinaryFormat::SetupJustVocab(std::size_t memory_size, uint8_t order) {
- vocab_size_ = memory_size;
- if (!write_mmap_) {
- header_size_ = 0;
- util::HugeMalloc(memory_size, true, memory_vocab_);
- return reinterpret_cast<uint8_t*>(memory_vocab_.get());
- }
- header_size_ = TotalHeaderSize(order);
- std::size_t total = util::CheckOverflow(static_cast<uint64_t>(header_size_) + static_cast<uint64_t>(memory_size));
- file_.reset(util::CreateOrThrow(write_mmap_));
- // some gccs complain about uninitialized variables even though all enum values are covered.
- void *vocab_base = NULL;
- switch (write_method_) {
- case Config::WRITE_MMAP:
- mapping_.reset(util::MapZeroedWrite(file_.get(), total), total, util::scoped_memory::MMAP_ALLOCATED);
- util::AdviseHugePages(vocab_base, total);
- vocab_base = mapping_.get();
- break;
- case Config::WRITE_AFTER:
- util::ResizeOrThrow(file_.get(), 0);
- util::HugeMalloc(total, true, memory_vocab_);
- vocab_base = memory_vocab_.get();
- break;
- }
- strncpy(reinterpret_cast<char*>(vocab_base), kMagicIncomplete, header_size_);
- return reinterpret_cast<uint8_t*>(vocab_base) + header_size_;
-}
-
-void *BinaryFormat::GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base) {
- assert(vocab_size_ != kInvalidSize);
- vocab_pad_ = vocab_pad;
- std::size_t new_size = header_size_ + vocab_size_ + vocab_pad_ + memory_size;
- vocab_string_offset_ = new_size;
- if (!write_mmap_ || write_method_ == Config::WRITE_AFTER) {
- util::HugeMalloc(memory_size, true, memory_search_);
- assert(header_size_ == 0 || write_mmap_);
- vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
- util::AdviseHugePages(memory_search_.get(), memory_size);
- return reinterpret_cast<uint8_t*>(memory_search_.get());
- }
-
- assert(write_method_ == Config::WRITE_MMAP);
- // Also known as total size without vocab words.
- // Grow the file to accomodate the search, using zeros.
- // According to man mmap, behavior is undefined when the file is resized
- // underneath a mmap that is not a multiple of the page size. So to be
- // safe, we'll unmap it and map it again.
- mapping_.reset();
- util::ResizeOrThrow(file_.get(), new_size);
- void *ret;
- MapFile(vocab_base, ret);
- util::AdviseHugePages(ret, new_size);
- return ret;
-}
-
-void BinaryFormat::WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base) {
- // Checking Config's include_vocab is the responsibility of the caller.
- assert(header_size_ != kInvalidSize && vocab_size_ != kInvalidSize);
- if (!write_mmap_) {
- // Unchanged base.
- vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get());
- search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
- return;
- }
- if (write_method_ == Config::WRITE_MMAP) {
- mapping_.reset();
- }
- util::SeekOrThrow(file_.get(), VocabStringReadingOffset());
- util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
- if (write_method_ == Config::WRITE_MMAP) {
- MapFile(vocab_base, search_base);
- } else {
- vocab_base = reinterpret_cast<uint8_t*>(memory_vocab_.get()) + header_size_;
- search_base = reinterpret_cast<uint8_t*>(memory_search_.get());
- }
-}
-
-void BinaryFormat::FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts) {
- if (!write_mmap_) return;
- switch (write_method_) {
- case Config::WRITE_MMAP:
- util::SyncOrThrow(mapping_.get(), mapping_.size());
- break;
- case Config::WRITE_AFTER:
- util::SeekOrThrow(file_.get(), 0);
- util::WriteOrThrow(file_.get(), memory_vocab_.get(), memory_vocab_.size());
- util::SeekOrThrow(file_.get(), header_size_ + vocab_size_ + vocab_pad_);
- util::WriteOrThrow(file_.get(), memory_search_.get(), memory_search_.size());
- util::FSyncOrThrow(file_.get());
- break;
- }
- // header and vocab share the same mmap.
- Parameters params = Parameters();
- memset(¶ms, 0, sizeof(Parameters));
- params.counts = counts;
- params.fixed.order = counts.size();
- params.fixed.probing_multiplier = config.probing_multiplier;
- params.fixed.model_type = model_type;
- params.fixed.has_vocabulary = config.include_vocab;
- params.fixed.search_version = search_version;
- switch (write_method_) {
- case Config::WRITE_MMAP:
- WriteHeader(mapping_.get(), params);
- util::SyncOrThrow(mapping_.get(), mapping_.size());
- break;
- case Config::WRITE_AFTER:
- {
- std::vector<uint8_t> buffer(TotalHeaderSize(counts.size()));
- WriteHeader(&buffer[0], params);
- util::SeekOrThrow(file_.get(), 0);
- util::WriteOrThrow(file_.get(), &buffer[0], buffer.size());
- }
- break;
- }
-}
-
-void BinaryFormat::MapFile(void *&vocab_base, void *&search_base) {
- mapping_.reset(util::MapOrThrow(vocab_string_offset_, true, util::kFileFlags, false, file_.get()), vocab_string_offset_, util::scoped_memory::MMAP_ALLOCATED);
- vocab_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_;
- search_base = reinterpret_cast<uint8_t*>(mapping_.get()) + header_size_ + vocab_size_ + vocab_pad_;
-}
-
-bool RecognizeBinary(const char *file, ModelType &recognized) {
- util::scoped_fd fd(util::OpenReadOrThrow(file));
- if (!IsBinaryFormat(fd.get())) {
- return false;
- }
- Parameters params;
- ReadHeader(fd.get(), params);
- recognized = params.fixed.model_type;
- return true;
-}
-
-} // namespace ngram
-} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/binary_format.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/binary_format.hh b/ext/kenlm/lm/binary_format.hh
deleted file mode 100644
index ff99b95..0000000
--- a/ext/kenlm/lm/binary_format.hh
+++ /dev/null
@@ -1,106 +0,0 @@
-#ifndef LM_BINARY_FORMAT_H
-#define LM_BINARY_FORMAT_H
-
-#include "lm/config.hh"
-#include "lm/model_type.hh"
-#include "lm/read_arpa.hh"
-
-#include "util/file_piece.hh"
-#include "util/mmap.hh"
-#include "util/scoped.hh"
-
-#include <cstddef>
-#include <vector>
-
-#include <stdint.h>
-
-namespace lm {
-namespace ngram {
-
-extern const char *kModelNames[6];
-
-/*Inspect a file to determine if it is a binary lm. If not, return false.
- * If so, return true and set recognized to the type. This is the only API in
- * this header designed for use by decoder authors.
- */
-bool RecognizeBinary(const char *file, ModelType &recognized);
-
-struct FixedWidthParameters {
- unsigned char order;
- float probing_multiplier;
- // What type of model is this?
- ModelType model_type;
- // Does the end of the file have the actual strings in the vocabulary?
- bool has_vocabulary;
- unsigned int search_version;
-};
-
-// This is a macro instead of an inline function so constants can be assigned using it.
-#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
-
-// Parameters stored in the header of a binary file.
-struct Parameters {
- FixedWidthParameters fixed;
- std::vector<uint64_t> counts;
-};
-
-class BinaryFormat {
- public:
- explicit BinaryFormat(const Config &config);
-
- // Reading a binary file:
- // Takes ownership of fd
- void InitializeBinary(int fd, ModelType model_type, unsigned int search_version, Parameters ¶ms);
- // Used to read parts of the file to update the config object before figuring out full size.
- void ReadForConfig(void *to, std::size_t amount, uint64_t offset_excluding_header) const;
- // Actually load the binary file and return a pointer to the beginning of the search area.
- void *LoadBinary(std::size_t size);
-
- uint64_t VocabStringReadingOffset() const {
- assert(vocab_string_offset_ != kInvalidOffset);
- return vocab_string_offset_;
- }
-
- // Writing a binary file or initializing in RAM from ARPA:
- // Size for vocabulary.
- void *SetupJustVocab(std::size_t memory_size, uint8_t order);
- // Warning: can change the vocaulary base pointer.
- void *GrowForSearch(std::size_t memory_size, std::size_t vocab_pad, void *&vocab_base);
- // Warning: can change vocabulary and search base addresses.
- void WriteVocabWords(const std::string &buffer, void *&vocab_base, void *&search_base);
- // Write the header at the beginning of the file.
- void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector<uint64_t> &counts);
-
- private:
- void MapFile(void *&vocab_base, void *&search_base);
-
- // Copied from configuration.
- const Config::WriteMethod write_method_;
- const char *write_mmap_;
- util::LoadMethod load_method_;
-
- // File behind memory, if any.
- util::scoped_fd file_;
-
- // If there is a file involved, a single mapping.
- util::scoped_memory mapping_;
-
- // If the data is only in memory, separately allocate each because the trie
- // knows vocab's size before it knows search's size (because SRILM might
- // have pruned).
- util::scoped_memory memory_vocab_, memory_search_;
-
- // Memory ranges. Note that these may not be contiguous and may not all
- // exist.
- std::size_t header_size_, vocab_size_, vocab_pad_;
- // aka end of search.
- uint64_t vocab_string_offset_;
-
- static const uint64_t kInvalidOffset = (uint64_t)-1;
-};
-
-bool IsBinaryFormat(int fd);
-
-} // namespace ngram
-} // namespace lm
-#endif // LM_BINARY_FORMAT_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/blank.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/blank.hh b/ext/kenlm/lm/blank.hh
deleted file mode 100644
index e09054c..0000000
--- a/ext/kenlm/lm/blank.hh
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef LM_BLANK_H
-#define LM_BLANK_H
-
-#include <limits>
-#include <stdint.h>
-#include <cmath>
-
-namespace lm {
-namespace ngram {
-
-/* Suppose "foo bar" appears with zero backoff but there is no trigram
- * beginning with these words. Then, when scoring "foo bar", the model could
- * return out_state containing "bar" or even null context if "bar" also has no
- * backoff and is never followed by another word. Then the backoff is set to
- * kNoExtensionBackoff. If the n-gram might be extended, then out_state must
- * contain the full n-gram, in which case kExtensionBackoff is set. In any
- * case, if an n-gram has non-zero backoff, the full state is returned so
- * backoff can be properly charged.
- * These differ only in sign bit because the backoff is in fact zero in either
- * case.
- */
-const float kNoExtensionBackoff = -0.0;
-const float kExtensionBackoff = 0.0;
-const uint64_t kNoExtensionQuant = 0;
-const uint64_t kExtensionQuant = 1;
-
-inline void SetExtension(float &backoff) {
- if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
-}
-
-// This compiles down nicely.
-inline bool HasExtension(const float &backoff) {
- typedef union { float f; uint32_t i; } UnionValue;
- UnionValue compare, interpret;
- compare.f = kNoExtensionBackoff;
- interpret.f = backoff;
- return compare.i != interpret.i;
-}
-
-} // namespace ngram
-} // namespace lm
-#endif // LM_BLANK_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/build_binary_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/build_binary_main.cc b/ext/kenlm/lm/build_binary_main.cc
deleted file mode 100644
index 35206e6..0000000
--- a/ext/kenlm/lm/build_binary_main.cc
+++ /dev/null
@@ -1,234 +0,0 @@
-#include "lm/model.hh"
-#include "lm/sizes.hh"
-#include "util/file_piece.hh"
-#include "util/usage.hh"
-
-#include <algorithm>
-#include <cstdlib>
-#include <exception>
-#include <iostream>
-#include <iomanip>
-#include <limits>
-#include <cmath>
-#include <cstdlib>
-
-#ifdef WIN32
-#include "util/getopt.hh"
-#else
-#include <unistd.h>
-#endif
-
-namespace lm {
-namespace ngram {
-namespace {
-
-void Usage(const char *name, const char *default_mem) {
- std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-T trie_temporary] [-S trie_building_mem] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n"
-"-u sets the log10 probability for <unk> if the ARPA file does not have one.\n"
-" Default is -100. The ARPA file will always take precedence.\n"
-"-s allows models to be built even if they do not have <s> and </s>.\n"
-"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n"
-"-w mmap|after determines how writing is done.\n"
-" mmap maps the binary file and writes to it. Default for trie.\n"
-" after allocates anonymous memory, builds, and writes. Default for probing.\n"
-"-r \"order1.arpa order2 order3 order4\" adds lower-order rest costs from these\n"
-" model files. order1.arpa must be an ARPA file. All others may be ARPA or\n"
-" the same data structure as being built. All files must have the same\n"
-" vocabulary. For probing, the unigrams must be in the same order.\n\n"
-"type is either probing or trie. Default is probing.\n\n"
-"probing uses a probing hash table. It is the fastest but uses the most memory.\n"
-"-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n"
-"trie is a straightforward trie with bit-level packing. It uses the least\n"
-"memory and is still faster than SRI or IRST. Building the trie format uses an\n"
-"on-disk sort to save memory.\n"
-"-T is the temporary directory prefix. Default is the output file name.\n"
-"-S determines memory use for sorting. Default is " << default_mem << ". This is compatible\n"
-" with GNU sort. The number is followed by a unit: \% for percent of physical\n"
-" memory, b for bytes, K for Kilobytes, M for megabytes, then G,T,P,E,Z,Y. \n"
-" Default unit is K for Kilobytes.\n"
-"-q turns quantization on and sets the number of bits (e.g. -q 8).\n"
-"-b sets backoff quantization bits. Requires -q and defaults to that value.\n"
-"-a compresses pointers using an array of offsets. The parameter is the\n"
-" maximum number of bits encoded by the array. Memory is minimized subject\n"
-" to the maximum, so pick 255 to minimize memory.\n\n"
-"-h print this help message.\n\n"
-"Get a memory estimate by passing an ARPA file without an output file name.\n";
- exit(1);
-}
-
-// I could really use boost::lexical_cast right about now.
-float ParseFloat(const char *from) {
- char *end;
- float ret = strtod(from, &end);
- if (*end) throw util::ParseNumberException(from);
- return ret;
-}
-unsigned long int ParseUInt(const char *from) {
- char *end;
- unsigned long int ret = strtoul(from, &end, 10);
- if (*end) throw util::ParseNumberException(from);
- return ret;
-}
-
-uint8_t ParseBitCount(const char *from) {
- unsigned long val = ParseUInt(from);
- if (val > 25) {
- util::ParseNumberException e(from);
- e << " bit counts are limited to 25.";
- }
- return val;
-}
-
-void ParseFileList(const char *from, std::vector<std::string> &to) {
- to.clear();
- while (true) {
- const char *i;
- for (i = from; *i && *i != ' '; ++i) {}
- to.push_back(std::string(from, i - from));
- if (!*i) break;
- from = i + 1;
- }
-}
-
-void ProbingQuantizationUnsupported() {
- std::cerr << "Quantization is only implemented in the trie data structure." << std::endl;
- exit(1);
-}
-
-} // namespace ngram
-} // namespace lm
-} // namespace
-
-int main(int argc, char *argv[]) {
- using namespace lm::ngram;
-
- const char *default_mem = util::GuessPhysicalMemory() ? "80%" : "1G";
-
- if (argc == 2 && !strcmp(argv[1], "--help"))
- Usage(argv[0], default_mem);
-
- try {
- bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false, rest = false;
- lm::ngram::Config config;
- config.building_memory = util::ParseSize(default_mem);
- int opt;
- while ((opt = getopt(argc, argv, "q:b:a:u:p:t:T:m:S:w:sir:h")) != -1) {
- switch(opt) {
- case 'q':
- config.prob_bits = ParseBitCount(optarg);
- if (!set_backoff_bits) config.backoff_bits = config.prob_bits;
- quantize = true;
- break;
- case 'b':
- config.backoff_bits = ParseBitCount(optarg);
- set_backoff_bits = true;
- break;
- case 'a':
- config.pointer_bhiksha_bits = ParseBitCount(optarg);
- bhiksha = true;
- break;
- case 'u':
- config.unknown_missing_logprob = ParseFloat(optarg);
- break;
- case 'p':
- config.probing_multiplier = ParseFloat(optarg);
- break;
- case 't': // legacy
- case 'T':
- config.temporary_directory_prefix = optarg;
- util::NormalizeTempPrefix(config.temporary_directory_prefix);
- break;
- case 'm': // legacy
- config.building_memory = ParseUInt(optarg) * 1048576;
- break;
- case 'S':
- config.building_memory = std::min(static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), util::ParseSize(optarg));
- break;
- case 'w':
- set_write_method = true;
- if (!strcmp(optarg, "mmap")) {
- config.write_method = Config::WRITE_MMAP;
- } else if (!strcmp(optarg, "after")) {
- config.write_method = Config::WRITE_AFTER;
- } else {
- Usage(argv[0], default_mem);
- }
- break;
- case 's':
- config.sentence_marker_missing = lm::SILENT;
- break;
- case 'i':
- config.positive_log_probability = lm::SILENT;
- break;
- case 'r':
- rest = true;
- ParseFileList(optarg, config.rest_lower_files);
- config.rest_function = Config::REST_LOWER;
- break;
- case 'h': // help
- default:
- Usage(argv[0], default_mem);
- }
- }
- if (!quantize && set_backoff_bits) {
- std::cerr << "You specified backoff quantization (-b) but not probability quantization (-q)" << std::endl;
- abort();
- }
- if (optind + 1 == argc) {
- ShowSizes(argv[optind], config);
- return 0;
- }
- const char *model_type;
- const char *from_file;
-
- if (optind + 2 == argc) {
- model_type = "probing";
- from_file = argv[optind];
- config.write_mmap = argv[optind + 1];
- } else if (optind + 3 == argc) {
- model_type = argv[optind];
- from_file = argv[optind + 1];
- config.write_mmap = argv[optind + 2];
- } else {
- Usage(argv[0], default_mem);
- return 1;
- }
- if (!strcmp(model_type, "probing")) {
- if (!set_write_method) config.write_method = Config::WRITE_AFTER;
- if (quantize || set_backoff_bits) ProbingQuantizationUnsupported();
- if (rest) {
- RestProbingModel(from_file, config);
- } else {
- ProbingModel(from_file, config);
- }
- } else if (!strcmp(model_type, "trie")) {
- if (rest) {
- std::cerr << "Rest + trie is not supported yet." << std::endl;
- return 1;
- }
- if (!set_write_method) config.write_method = Config::WRITE_MMAP;
- if (quantize) {
- if (bhiksha) {
- QuantArrayTrieModel(from_file, config);
- } else {
- QuantTrieModel(from_file, config);
- }
- } else {
- if (bhiksha) {
- ArrayTrieModel(from_file, config);
- } else {
- TrieModel(from_file, config);
- }
- }
- } else {
- Usage(argv[0], default_mem);
- }
- }
- catch (const std::exception &e) {
- std::cerr << e.what() << std::endl;
- std::cerr << "ERROR" << std::endl;
- return 1;
- }
- std::cerr << "SUCCESS" << std::endl;
- return 0;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/CMakeLists.txt b/ext/kenlm/lm/builder/CMakeLists.txt
deleted file mode 100644
index cc0d3ed..0000000
--- a/ext/kenlm/lm/builder/CMakeLists.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-cmake_minimum_required(VERSION 2.8.8)
-#
-# The KenLM cmake files make use of add_library(... OBJECTS ...)
-#
-# This syntax allows grouping of source files when compiling
-# (effectively creating "fake" libraries based on source subdirs).
-#
-# This syntax was only added in cmake version 2.8.8
-#
-# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
-
-
-# This CMake file was created by Lane Schwartz <do...@gmail.com>
-
-# Explicitly list the source files for this subdirectory
-#
-# If you add any source files to this subdirectory
-# that should be included in the kenlm library,
-# (this excludes any unit test files)
-# you should add them to the following list:
-#
-# In order to set correct paths to these files
-# in case this variable is referenced by CMake files in the parent directory,
-# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
-#
-set(KENLM_BUILDER_SOURCE
- ${CMAKE_CURRENT_SOURCE_DIR}/adjust_counts.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/corpus_count.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/initial_probabilities.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/interpolate.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/output.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/pipeline.cc
- )
-
-
-# Group these objects together for later use.
-#
-# Given add_library(foo OBJECT ${my_foo_sources}),
-# refer to these objects as $<TARGET_OBJECTS:foo>
-#
-add_library(kenlm_builder OBJECT ${KENLM_BUILDER_SOURCE})
-
-
-# Compile the executable, linking against the requisite dependent object files
-add_executable(lmplz lmplz_main.cc $<TARGET_OBJECTS:kenlm> $<TARGET_OBJECTS:kenlm_common> $<TARGET_OBJECTS:kenlm_builder> $<TARGET_OBJECTS:kenlm_util>)
-
-# Link the executable against boost
-target_link_libraries(lmplz ${Boost_LIBRARIES} pthread)
-
-# Group executables together
-set_target_properties(lmplz PROPERTIES FOLDER executables)
-
-if(BUILD_TESTING)
-
- # Explicitly list the Boost test files to be compiled
- set(KENLM_BOOST_TESTS_LIST
- adjust_counts_test
- corpus_count_test
- )
-
- AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
- DEPENDS $<TARGET_OBJECTS:kenlm>
- $<TARGET_OBJECTS:kenlm_common>
- $<TARGET_OBJECTS:kenlm_util>
- $<TARGET_OBJECTS:kenlm_builder>
- LIBRARIES ${Boost_LIBRARIES} pthread)
-endif()
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/Jamfile
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/Jamfile b/ext/kenlm/lm/builder/Jamfile
deleted file mode 100644
index 329a8e0..0000000
--- a/ext/kenlm/lm/builder/Jamfile
+++ /dev/null
@@ -1,13 +0,0 @@
-fakelib builder : [ glob *.cc : *test.cc *main.cc ]
- ../../util//kenutil ../../util/stream//stream ../../util/double-conversion//double-conversion ..//kenlm ../common//common
- : : : <library>/top//boost_thread $(timer-link) ;
-
-exe lmplz : lmplz_main.cc builder /top//boost_program_options ;
-
-exe dump_counts : dump_counts_main.cc builder ;
-
-alias programs : lmplz dump_counts ;
-
-import testing ;
-unit-test corpus_count_test : corpus_count_test.cc builder /top//boost_unit_test_framework ;
-unit-test adjust_counts_test : adjust_counts_test.cc builder /top//boost_unit_test_framework ;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/README.md
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/README.md b/ext/kenlm/lm/builder/README.md
deleted file mode 100644
index be0d35e..0000000
--- a/ext/kenlm/lm/builder/README.md
+++ /dev/null
@@ -1,47 +0,0 @@
-Dependencies
-============
-
-Boost >= 1.42.0 is required.
-
-For Ubuntu,
-```bash
-sudo apt-get install libboost1.48-all-dev
-```
-
-Alternatively, you can download, compile, and install it yourself:
-
-```bash
-wget http://sourceforge.net/projects/boost/files/boost/1.52.0/boost_1_52_0.tar.gz/download -O boost_1_52_0.tar.gz
-tar -xvzf boost_1_52_0.tar.gz
-cd boost_1_52_0
-./bootstrap.sh
-./b2
-sudo ./b2 install
-```
-
-Local install options (in a user-space prefix directory) are also possible. See http://www.boost.org/doc/libs/1_52_0/doc/html/bbv2/installation.html.
-
-
-Building
-========
-
-```bash
-bjam
-```
-Your distribution might package bjam and boost-build separately from Boost. Both are required.
-
-Usage
-=====
-
-Run
-```bash
-$ bin/lmplz
-```
-to see command line arguments
-
-Running
-=======
-
-```bash
-bin/lmplz -o 5 <text >text.arpa
-```
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/TODO
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/TODO b/ext/kenlm/lm/builder/TODO
deleted file mode 100644
index cb5aef3..0000000
--- a/ext/kenlm/lm/builder/TODO
+++ /dev/null
@@ -1,5 +0,0 @@
-More tests!
-Sharding.
-Some way to manage all the crazy config options.
-Option to build the binary file directly.
-Interpolation of different orders.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/adjust_counts.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/adjust_counts.cc b/ext/kenlm/lm/builder/adjust_counts.cc
deleted file mode 100644
index b4c5ba8..0000000
--- a/ext/kenlm/lm/builder/adjust_counts.cc
+++ /dev/null
@@ -1,353 +0,0 @@
-#include "lm/builder/adjust_counts.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/builder/payload.hh"
-#include "util/stream/timer.hh"
-
-#include <algorithm>
-#include <iostream>
-#include <limits>
-
-namespace lm { namespace builder {
-
-BadDiscountException::BadDiscountException() throw() {}
-BadDiscountException::~BadDiscountException() throw() {}
-
-namespace {
-// Return last word in full that is different.
-const WordIndex* FindDifference(const NGram<BuildingPayload> &full, const NGram<BuildingPayload> &lower_last) {
- const WordIndex *cur_word = full.end() - 1;
- const WordIndex *pre_word = lower_last.end() - 1;
- // Find last difference.
- for (; pre_word >= lower_last.begin() && *pre_word == *cur_word; --cur_word, --pre_word) {}
- return cur_word;
-}
-
-class StatCollector {
- public:
- StatCollector(std::size_t order, std::vector<uint64_t> &counts, std::vector<uint64_t> &counts_pruned, std::vector<Discount> &discounts)
- : orders_(order), full_(orders_.back()), counts_(counts), counts_pruned_(counts_pruned), discounts_(discounts) {
- memset(&orders_[0], 0, sizeof(OrderStat) * order);
- }
-
- ~StatCollector() {}
-
- void CalculateDiscounts(const DiscountConfig &config) {
- counts_.resize(orders_.size());
- counts_pruned_.resize(orders_.size());
- for (std::size_t i = 0; i < orders_.size(); ++i) {
- const OrderStat &s = orders_[i];
- counts_[i] = s.count;
- counts_pruned_[i] = s.count_pruned;
- }
-
- discounts_ = config.overwrite;
- discounts_.resize(orders_.size());
- for (std::size_t i = config.overwrite.size(); i < orders_.size(); ++i) {
- const OrderStat &s = orders_[i];
- try {
- for (unsigned j = 1; j < 4; ++j) {
- // TODO: Specialize error message for j == 3, meaning 3+
- UTIL_THROW_IF(s.n[j] == 0, BadDiscountException, "Could not calculate Kneser-Ney discounts for "
- << (i+1) << "-grams with adjusted count " << (j+1) << " because we didn't observe any "
- << (i+1) << "-grams with adjusted count " << j << "; Is this small or artificial data?\n"
- << "Try deduplicating the input. To override this error for e.g. a class-based model, rerun with --discount_fallback\n");
- }
-
- // See equation (26) in Chen and Goodman.
- discounts_[i].amount[0] = 0.0;
- float y = static_cast<float>(s.n[1]) / static_cast<float>(s.n[1] + 2.0 * s.n[2]);
- for (unsigned j = 1; j < 4; ++j) {
- discounts_[i].amount[j] = static_cast<float>(j) - static_cast<float>(j + 1) * y * static_cast<float>(s.n[j+1]) / static_cast<float>(s.n[j]);
- UTIL_THROW_IF(discounts_[i].amount[j] < 0.0 || discounts_[i].amount[j] > j, BadDiscountException, "ERROR: " << (i+1) << "-gram discount out of range for adjusted count " << j << ": " << discounts_[i].amount[j]);
- }
- } catch (const BadDiscountException &e) {
- switch (config.bad_action) {
- case THROW_UP:
- throw;
- case COMPLAIN:
- std::cerr << "Substituting fallback discounts for order " << i << ": D1=" << config.fallback.amount[1] << " D2=" << config.fallback.amount[2] << " D3+=" << config.fallback.amount[3] << std::endl;
- case SILENT:
- break;
- }
- discounts_[i] = config.fallback;
- }
- }
- }
-
- void Add(std::size_t order_minus_1, uint64_t count, bool pruned = false) {
- OrderStat &stat = orders_[order_minus_1];
- ++stat.count;
- if (!pruned)
- ++stat.count_pruned;
- if (count < 5) ++stat.n[count];
- }
-
- void AddFull(uint64_t count, bool pruned = false) {
- ++full_.count;
- if (!pruned)
- ++full_.count_pruned;
- if (count < 5) ++full_.n[count];
- }
-
- private:
- struct OrderStat {
- // n_1 in equation 26 of Chen and Goodman etc
- uint64_t n[5];
- uint64_t count;
- uint64_t count_pruned;
- };
-
- std::vector<OrderStat> orders_;
- OrderStat &full_;
-
- std::vector<uint64_t> &counts_;
- std::vector<uint64_t> &counts_pruned_;
- std::vector<Discount> &discounts_;
-};
-
-// Reads all entries in order like NGramStream does.
-// But deletes any entries that have <s> in the 1st (not 0th) position on the
-// way out by putting other entries in their place. This disrupts the sort
-// order but we don't care because the data is going to be sorted again.
-class CollapseStream {
- public:
- CollapseStream(const util::stream::ChainPosition &position, uint64_t prune_threshold, const std::vector<bool>& prune_words) :
- current_(NULL, NGram<BuildingPayload>::OrderFromSize(position.GetChain().EntrySize())),
- prune_threshold_(prune_threshold),
- prune_words_(prune_words),
- block_(position) {
- StartBlock();
- }
-
- const NGram<BuildingPayload> &operator*() const { return current_; }
- const NGram<BuildingPayload> *operator->() const { return ¤t_; }
-
- operator bool() const { return block_; }
-
- CollapseStream &operator++() {
- assert(block_);
-
- if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) {
- memcpy(current_.Base(), copy_from_, current_.TotalSize());
- UpdateCopyFrom();
-
- // Mark highest order n-grams for later pruning
- if(current_.Value().count <= prune_threshold_) {
- current_.Value().Mark();
- }
-
- if(!prune_words_.empty()) {
- for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
- if(prune_words_[*i]) {
- current_.Value().Mark();
- break;
- }
- }
- }
-
- }
-
- current_.NextInMemory();
- uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
- if (current_.Base() == block_base + block_->ValidSize()) {
- block_->SetValidSize(copy_from_ + current_.TotalSize() - block_base);
- ++block_;
- StartBlock();
- }
-
- // Mark highest order n-grams for later pruning
- if(current_.Value().count <= prune_threshold_) {
- current_.Value().Mark();
- }
-
- if(!prune_words_.empty()) {
- for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
- if(prune_words_[*i]) {
- current_.Value().Mark();
- break;
- }
- }
- }
-
- return *this;
- }
-
- private:
- void StartBlock() {
- for (; ; ++block_) {
- if (!block_) return;
- if (block_->ValidSize()) break;
- }
- current_.ReBase(block_->Get());
- copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize();
- UpdateCopyFrom();
-
- // Mark highest order n-grams for later pruning
- if(current_.Value().count <= prune_threshold_) {
- current_.Value().Mark();
- }
-
- if(!prune_words_.empty()) {
- for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
- if(prune_words_[*i]) {
- current_.Value().Mark();
- break;
- }
- }
- }
-
- }
-
- // Find last without bos.
- void UpdateCopyFrom() {
- for (copy_from_ -= current_.TotalSize(); copy_from_ >= current_.Base(); copy_from_ -= current_.TotalSize()) {
- if (NGram<BuildingPayload>(copy_from_, current_.Order()).begin()[1] != kBOS) break;
- }
- }
-
- NGram<BuildingPayload> current_;
-
- // Goes backwards in the block
- uint8_t *copy_from_;
- uint64_t prune_threshold_;
- const std::vector<bool>& prune_words_;
- util::stream::Link block_;
-};
-
-} // namespace
-
-void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
- UTIL_TIMER("(%w s) Adjusted counts\n");
-
- const std::size_t order = positions.size();
- StatCollector stats(order, counts_, counts_pruned_, discounts_);
- if (order == 1) {
-
- // Only unigrams. Just collect stats.
- for (NGramStream<BuildingPayload> full(positions[0]); full; ++full) {
-
- // Do not prune <s> </s> <unk>
- if(*full->begin() > 2) {
- if(full->Value().count <= prune_thresholds_[0])
- full->Value().Mark();
-
- if(!prune_words_.empty() && prune_words_[*full->begin()])
- full->Value().Mark();
- }
-
- stats.AddFull(full->Value().UnmarkedCount(), full->Value().IsMarked());
- }
-
- stats.CalculateDiscounts(discount_config_);
- return;
- }
-
- NGramStreams<BuildingPayload> streams;
- streams.Init(positions, positions.size() - 1);
-
- CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_);
-
- // Initialization: <unk> has count 0 and so does <s>.
- NGramStream<BuildingPayload> *lower_valid = streams.begin();
- const NGramStream<BuildingPayload> *const streams_begin = streams.begin();
- streams[0]->Value().count = 0;
- *streams[0]->begin() = kUNK;
- stats.Add(0, 0);
- (++streams[0])->Value().count = 0;
- *streams[0]->begin() = kBOS;
- // <s> is not in stats yet because it will get put in later.
-
- // This keeps track of actual counts for lower orders. It is not output
- // (only adjusted counts are), but used to determine pruning.
- std::vector<uint64_t> actual_counts(positions.size(), 0);
- // Something of a hack: don't prune <s>.
- actual_counts[0] = std::numeric_limits<uint64_t>::max();
-
- // Iterate over full (the stream of the highest order ngrams)
- for (; full; ++full) {
- const WordIndex *different = FindDifference(*full, **lower_valid);
- std::size_t same = full->end() - 1 - different;
-
- // STEP 1: Output all the n-grams that changed.
- for (; lower_valid >= streams.begin() + same; --lower_valid) {
- uint64_t order_minus_1 = lower_valid - streams_begin;
- if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
- (*lower_valid)->Value().Mark();
-
- if(!prune_words_.empty()) {
- for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) {
- if(prune_words_[*i]) {
- (*lower_valid)->Value().Mark();
- break;
- }
- }
- }
-
- stats.Add(order_minus_1, (*lower_valid)->Value().UnmarkedCount(), (*lower_valid)->Value().IsMarked());
- ++*lower_valid;
- }
-
- // STEP 2: Update n-grams that still match.
- // n-grams that match get count from the full entry.
- for (std::size_t i = 0; i < same; ++i) {
- actual_counts[i] += full->Value().UnmarkedCount();
- }
- // Increment the number of unique extensions for the longest match.
- if (same) ++streams[same - 1]->Value().count;
-
- // STEP 3: Initialize new n-grams.
- // This is here because bos is also const WordIndex *, so copy gets
- // consistent argument types.
- const WordIndex *full_end = full->end();
- // Initialize and mark as valid up to bos.
- const WordIndex *bos;
- for (bos = different; (bos > full->begin()) && (*bos != kBOS); --bos) {
- NGramStream<BuildingPayload> &to = *++lower_valid;
- std::copy(bos, full_end, to->begin());
- to->Value().count = 1;
- actual_counts[lower_valid - streams_begin] = full->Value().UnmarkedCount();
- }
- // Now bos indicates where <s> is or is the 0th word of full.
- if (bos != full->begin()) {
- // There is an <s> beyond the 0th word.
- NGramStream<BuildingPayload> &to = *++lower_valid;
- std::copy(bos, full_end, to->begin());
-
- // Anything that begins with <s> has full non adjusted count.
- to->Value().count = full->Value().UnmarkedCount();
- actual_counts[lower_valid - streams_begin] = full->Value().UnmarkedCount();
- } else {
- stats.AddFull(full->Value().UnmarkedCount(), full->Value().IsMarked());
- }
- assert(lower_valid >= &streams[0]);
- }
-
- // The above loop outputs n-grams when it observes changes. This outputs
- // the last n-grams.
- for (NGramStream<BuildingPayload> *s = streams.begin(); s <= lower_valid; ++s) {
- uint64_t lower_count = actual_counts[(*s)->Order() - 1];
- if(lower_count <= prune_thresholds_[(*s)->Order() - 1])
- (*s)->Value().Mark();
-
- if(!prune_words_.empty()) {
- for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) {
- if(prune_words_[*i]) {
- (*s)->Value().Mark();
- break;
- }
- }
- }
-
- stats.Add(s - streams.begin(), lower_count, (*s)->Value().IsMarked());
- ++*s;
- }
- // Poison everyone! Except the N-grams which were already poisoned by the input.
- for (NGramStream<BuildingPayload> *s = streams.begin(); s != streams.end(); ++s)
- s->Poison();
-
- stats.CalculateDiscounts(discount_config_);
-
- // NOTE: See special early-return case for unigrams near the top of this function
-}
-
-}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/adjust_counts.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/adjust_counts.hh b/ext/kenlm/lm/builder/adjust_counts.hh
deleted file mode 100644
index 29319ba..0000000
--- a/ext/kenlm/lm/builder/adjust_counts.hh
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef LM_BUILDER_ADJUST_COUNTS_H
-#define LM_BUILDER_ADJUST_COUNTS_H
-
-#include "lm/builder/discount.hh"
-#include "lm/lm_exception.hh"
-#include "util/exception.hh"
-
-#include <vector>
-
-#include <stdint.h>
-
-namespace util { namespace stream { class ChainPositions; } }
-
-namespace lm {
-namespace builder {
-
-class BadDiscountException : public util::Exception {
- public:
- BadDiscountException() throw();
- ~BadDiscountException() throw();
-};
-
-struct DiscountConfig {
- // Overrides discounts for orders [1,discount_override.size()].
- std::vector<Discount> overwrite;
- // If discounting fails for an order, copy them from here.
- Discount fallback;
- // What to do when discounts are out of range or would trigger divison by
- // zero. It it does something other than THROW_UP, use fallback_discount.
- WarningAction bad_action;
-};
-
-/* Compute adjusted counts.
- * Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
- * Output: [1,N]-grams with adjusted counts.
- * [1,N)-grams are in suffix order
- * N-grams are in undefined order (they're going to be sorted anyway).
- */
-class AdjustCounts {
- public:
- // counts: output
- // counts_pruned: output
- // discounts: mostly output. If the input already has entries, they will be kept.
- // prune_thresholds: input. n-grams with normal (not adjusted) count below this will be pruned.
- AdjustCounts(
- const std::vector<uint64_t> &prune_thresholds,
- std::vector<uint64_t> &counts,
- std::vector<uint64_t> &counts_pruned,
- const std::vector<bool> &prune_words,
- const DiscountConfig &discount_config,
- std::vector<Discount> &discounts)
- : prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
- prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
- {}
-
- void Run(const util::stream::ChainPositions &positions);
-
- private:
- const std::vector<uint64_t> &prune_thresholds_;
- std::vector<uint64_t> &counts_;
- std::vector<uint64_t> &counts_pruned_;
- const std::vector<bool> &prune_words_;
-
- DiscountConfig discount_config_;
- std::vector<Discount> &discounts_;
-};
-
-} // namespace builder
-} // namespace lm
-
-#endif // LM_BUILDER_ADJUST_COUNTS_H
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/adjust_counts_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/adjust_counts_test.cc b/ext/kenlm/lm/builder/adjust_counts_test.cc
deleted file mode 100644
index fff551f..0000000
--- a/ext/kenlm/lm/builder/adjust_counts_test.cc
+++ /dev/null
@@ -1,112 +0,0 @@
-#include "lm/builder/adjust_counts.hh"
-
-#include "lm/common/ngram_stream.hh"
-#include "lm/builder/payload.hh"
-#include "util/scoped.hh"
-
-#include <boost/thread/thread.hpp>
-#define BOOST_TEST_MODULE AdjustCounts
-#include <boost/test/unit_test.hpp>
-
-namespace lm { namespace builder { namespace {
-
-class KeepCopy {
- public:
- KeepCopy() : size_(0) {}
-
- void Run(const util::stream::ChainPosition &position) {
- for (util::stream::Link link(position); link; ++link) {
- mem_.call_realloc(size_ + link->ValidSize());
- memcpy(static_cast<uint8_t*>(mem_.get()) + size_, link->Get(), link->ValidSize());
- size_ += link->ValidSize();
- }
- }
-
- uint8_t *Get() { return static_cast<uint8_t*>(mem_.get()); }
- std::size_t Size() const { return size_; }
-
- private:
- util::scoped_malloc mem_;
- std::size_t size_;
-};
-
-struct Gram4 {
- WordIndex ids[4];
- uint64_t count;
-};
-
-class WriteInput {
- public:
- void Run(const util::stream::ChainPosition &position) {
- NGramStream<BuildingPayload> input(position);
- Gram4 grams[] = {
- {{0,0,0,0},10},
- {{0,0,3,0},3},
- // bos
- {{1,1,1,2},5},
- {{0,0,3,2},5},
- };
- for (size_t i = 0; i < sizeof(grams) / sizeof(Gram4); ++i, ++input) {
- memcpy(input->begin(), grams[i].ids, sizeof(WordIndex) * 4);
- input->Value().count = grams[i].count;
- }
- input.Poison();
- }
-};
-
-BOOST_AUTO_TEST_CASE(Simple) {
- KeepCopy outputs[4];
- std::vector<uint64_t> counts;
- std::vector<Discount> discount;
- {
- util::stream::ChainConfig config;
- config.total_memory = 100;
- config.block_count = 1;
- util::stream::Chains chains(4);
- for (unsigned i = 0; i < 4; ++i) {
- config.entry_size = NGram<BuildingPayload>::TotalSize(i + 1);
- chains.push_back(config);
- }
-
- chains[3] >> WriteInput();
- util::stream::ChainPositions for_adjust(chains);
- for (unsigned i = 0; i < 4; ++i) {
- chains[i] >> boost::ref(outputs[i]);
- }
- chains >> util::stream::kRecycle;
- std::vector<uint64_t> counts_pruned(4);
- std::vector<uint64_t> prune_thresholds(4);
- DiscountConfig discount_config;
- discount_config.fallback = Discount();
- discount_config.bad_action = THROW_UP;
- BOOST_CHECK_THROW(AdjustCounts(prune_thresholds, counts, counts_pruned, std::vector<bool>(), discount_config, discount).Run(for_adjust), BadDiscountException);
- }
- BOOST_REQUIRE_EQUAL(4UL, counts.size());
- BOOST_CHECK_EQUAL(4UL, counts[0]);
- // These are no longer set because the discounts are bad.
-/* BOOST_CHECK_EQUAL(4UL, counts[1]);
- BOOST_CHECK_EQUAL(3UL, counts[2]);
- BOOST_CHECK_EQUAL(3UL, counts[3]);*/
- BOOST_REQUIRE_EQUAL(NGram<BuildingPayload>::TotalSize(1) * 4, outputs[0].Size());
- NGram<BuildingPayload> uni(outputs[0].Get(), 1);
- BOOST_CHECK_EQUAL(kUNK, *uni.begin());
- BOOST_CHECK_EQUAL(0ULL, uni.Value().count);
- uni.NextInMemory();
- BOOST_CHECK_EQUAL(kBOS, *uni.begin());
- BOOST_CHECK_EQUAL(0ULL, uni.Value().count);
- uni.NextInMemory();
- BOOST_CHECK_EQUAL(0UL, *uni.begin());
- BOOST_CHECK_EQUAL(2ULL, uni.Value().count);
- uni.NextInMemory();
- BOOST_CHECK_EQUAL(2ULL, uni.Value().count);
- BOOST_CHECK_EQUAL(2UL, *uni.begin());
-
- BOOST_REQUIRE_EQUAL(NGram<BuildingPayload>::TotalSize(2) * 4, outputs[1].Size());
- NGram<BuildingPayload> bi(outputs[1].Get(), 2);
- BOOST_CHECK_EQUAL(0UL, *bi.begin());
- BOOST_CHECK_EQUAL(0UL, *(bi.begin() + 1));
- BOOST_CHECK_EQUAL(1ULL, bi.Value().count);
- bi.NextInMemory();
-}
-
-}}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/combine_counts.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/combine_counts.hh b/ext/kenlm/lm/builder/combine_counts.hh
deleted file mode 100644
index 2eda517..0000000
--- a/ext/kenlm/lm/builder/combine_counts.hh
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef LM_BUILDER_COMBINE_COUNTS_H
-#define LM_BUILDER_COMBINE_COUNTS_H
-
-#include "lm/builder/payload.hh"
-#include "lm/common/ngram.hh"
-#include "lm/common/compare.hh"
-#include "lm/word_index.hh"
-#include "util/stream/sort.hh"
-
-#include <functional>
-#include <string>
-
-namespace lm {
-namespace builder {
-
-// Sum counts for the same n-gram.
-struct CombineCounts {
- bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const {
- NGram<BuildingPayload> first(first_void, compare.Order());
- // There isn't a const version of NGram.
- NGram<BuildingPayload> second(const_cast<void*>(second_void), compare.Order());
- if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false;
- first.Value().count += second.Value().count;
- return true;
- }
-};
-
-} // namespace builder
-} // namespace lm
-
-#endif // LM_BUILDER_COMBINE_COUNTS_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/corpus_count.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/corpus_count.cc b/ext/kenlm/lm/builder/corpus_count.cc
deleted file mode 100644
index 0414c22..0000000
--- a/ext/kenlm/lm/builder/corpus_count.cc
+++ /dev/null
@@ -1,239 +0,0 @@
-#include "lm/builder/corpus_count.hh"
-
-#include "lm/builder/payload.hh"
-#include "lm/common/ngram.hh"
-#include "lm/lm_exception.hh"
-#include "lm/vocab.hh"
-#include "lm/word_index.hh"
-#include "util/file_stream.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-#include "util/murmur_hash.hh"
-#include "util/probing_hash_table.hh"
-#include "util/scoped.hh"
-#include "util/stream/chain.hh"
-#include "util/stream/timer.hh"
-#include "util/tokenize_piece.hh"
-
-#include <functional>
-
-#include <stdint.h>
-
-namespace lm {
-namespace builder {
-namespace {
-
-class DedupeHash : public std::unary_function<const WordIndex *, bool> {
- public:
- explicit DedupeHash(std::size_t order) : size_(order * sizeof(WordIndex)) {}
-
- std::size_t operator()(const WordIndex *start) const {
- return util::MurmurHashNative(start, size_);
- }
-
- private:
- const std::size_t size_;
-};
-
-class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
- public:
- explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
-
- bool operator()(const WordIndex *first, const WordIndex *second) const {
- return !memcmp(first, second, size_);
- }
-
- private:
- const std::size_t size_;
-};
-
-struct DedupeEntry {
- typedef WordIndex *Key;
- Key GetKey() const { return key; }
- void SetKey(WordIndex *to) { key = to; }
- Key key;
- static DedupeEntry Construct(WordIndex *at) {
- DedupeEntry ret;
- ret.key = at;
- return ret;
- }
-};
-
-
-// TODO: don't have this here, should be with probing hash table defaults?
-const float kProbingMultiplier = 1.5;
-
-typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
-
-class Writer {
- public:
- Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
- : block_(position), gram_(block_->Get(), order),
- dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
- dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
- buffer_(new WordIndex[order - 1]),
- block_size_(position.GetChain().BlockSize()) {
- dedupe_.Clear();
- assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
- if (order == 1) {
- // Add special words. AdjustCounts is responsible if order != 1.
- AddUnigramWord(kUNK);
- AddUnigramWord(kBOS);
- }
- }
-
- ~Writer() {
- block_->SetValidSize(reinterpret_cast<const uint8_t*>(gram_.begin()) - static_cast<const uint8_t*>(block_->Get()));
- (++block_).Poison();
- }
-
- // Write context with a bunch of <s>
- void StartSentence() {
- for (WordIndex *i = gram_.begin(); i != gram_.end() - 1; ++i) {
- *i = kBOS;
- }
- }
-
- void Append(WordIndex word) {
- *(gram_.end() - 1) = word;
- Dedupe::MutableIterator at;
- bool found = dedupe_.FindOrInsert(DedupeEntry::Construct(gram_.begin()), at);
- if (found) {
- // Already present.
- NGram<BuildingPayload> already(at->key, gram_.Order());
- ++(already.Value().count);
- // Shift left by one.
- memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
- return;
- }
- // Complete the write.
- gram_.Value().count = 1;
- // Prepare the next n-gram.
- if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
- NGram<BuildingPayload> last(gram_);
- gram_.NextInMemory();
- std::copy(last.begin() + 1, last.end(), gram_.begin());
- return;
- }
- // Block end. Need to store the context in a temporary buffer.
- std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
- dedupe_.Clear();
- block_->SetValidSize(block_size_);
- gram_.ReBase((++block_)->Get());
- std::copy(buffer_.get(), buffer_.get() + gram_.Order() - 1, gram_.begin());
- }
-
- private:
- void AddUnigramWord(WordIndex index) {
- *gram_.begin() = index;
- gram_.Value().count = 0;
- gram_.NextInMemory();
- if (gram_.Base() == static_cast<uint8_t*>(block_->Get()) + block_size_) {
- block_->SetValidSize(block_size_);
- gram_.ReBase((++block_)->Get());
- }
- }
-
- util::stream::Link block_;
-
- NGram<BuildingPayload> gram_;
-
- // This is the memory behind the invalid value in dedupe_.
- std::vector<WordIndex> dedupe_invalid_;
- // Hash table combiner implementation.
- Dedupe dedupe_;
-
- // Small buffer to hold existing ngrams when shifting across a block boundary.
- boost::scoped_array<WordIndex> buffer_;
-
- const std::size_t block_size_;
-};
-
-} // namespace
-
-float CorpusCount::DedupeMultiplier(std::size_t order) {
- return kProbingMultiplier * static_cast<float>(sizeof(DedupeEntry)) / static_cast<float>(NGram<BuildingPayload>::TotalSize(order));
-}
-
-std::size_t CorpusCount::VocabUsage(std::size_t vocab_estimate) {
- return ngram::GrowableVocab<ngram::WriteUniqueWords>::MemUsage(vocab_estimate);
-}
-
-CorpusCount::CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol)
- : from_(from), vocab_write_(vocab_write), token_count_(token_count), type_count_(type_count),
- prune_words_(prune_words), prune_vocab_filename_(prune_vocab_filename),
- dedupe_mem_size_(Dedupe::Size(entries_per_block, kProbingMultiplier)),
- dedupe_mem_(util::MallocOrThrow(dedupe_mem_size_)),
- disallowed_symbol_action_(disallowed_symbol) {
-}
-
-namespace {
- void ComplainDisallowed(StringPiece word, WarningAction &action) {
- switch (action) {
- case SILENT:
- return;
- case COMPLAIN:
- std::cerr << "Warning: " << word << " appears in the input. All instances of <s>, </s>, and <unk> will be interpreted as whitespace." << std::endl;
- action = SILENT;
- return;
- case THROW_UP:
- UTIL_THROW(FormatLoadException, "Special word " << word << " is not allowed in the corpus. I plan to support models containing <unk> in the future. Pass --skip_symbols to convert these symbols to whitespace.");
- }
- }
-} // namespace
-
-void CorpusCount::Run(const util::stream::ChainPosition &position) {
- ngram::GrowableVocab<ngram::WriteUniqueWords> vocab(type_count_, vocab_write_);
- token_count_ = 0;
- type_count_ = 0;
- const WordIndex end_sentence = vocab.FindOrInsert("</s>");
- Writer writer(NGram<BuildingPayload>::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
- uint64_t count = 0;
- bool delimiters[256];
- util::BoolCharacter::Build("\0\t\n\r ", delimiters);
- try {
- while(true) {
- StringPiece line(from_.ReadLine());
- writer.StartSentence();
- for (util::TokenIter<util::BoolCharacter, true> w(line, delimiters); w; ++w) {
- WordIndex word = vocab.FindOrInsert(*w);
- if (word <= 2) {
- ComplainDisallowed(*w, disallowed_symbol_action_);
- continue;
- }
- writer.Append(word);
- ++count;
- }
- writer.Append(end_sentence);
- }
- } catch (const util::EndOfFileException &e) {}
- token_count_ = count;
- type_count_ = vocab.Size();
-
- // Create list of unigrams that are supposed to be pruned
- if (!prune_vocab_filename_.empty()) {
- try {
- util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str());
-
- prune_words_.resize(vocab.Size(), true);
- try {
- while (true) {
- StringPiece word(prune_vocab_file.ReadDelimited(delimiters));
- prune_words_[vocab.Index(word)] = false;
- }
- } catch (const util::EndOfFileException &e) {}
-
- // Never prune <unk>, <s>, </s>
- prune_words_[kUNK] = false;
- prune_words_[kBOS] = false;
- prune_words_[kEOS] = false;
-
- } catch (const util::Exception &e) {
- std::cerr << e.what() << std::endl;
- abort();
- }
- }
-}
-
-} // namespace builder
-} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/corpus_count.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/corpus_count.hh b/ext/kenlm/lm/builder/corpus_count.hh
deleted file mode 100644
index 165505c..0000000
--- a/ext/kenlm/lm/builder/corpus_count.hh
+++ /dev/null
@@ -1,53 +0,0 @@
-#ifndef LM_BUILDER_CORPUS_COUNT_H
-#define LM_BUILDER_CORPUS_COUNT_H
-
-#include "lm/lm_exception.hh"
-#include "lm/word_index.hh"
-#include "util/scoped.hh"
-
-#include <cstddef>
-#include <string>
-#include <stdint.h>
-#include <vector>
-
-namespace util {
-class FilePiece;
-namespace stream {
-class ChainPosition;
-} // namespace stream
-} // namespace util
-
-namespace lm {
-namespace builder {
-
-class CorpusCount {
- public:
- // Memory usage will be DedupeMultipler(order) * block_size + total_chain_size + unknown vocab_hash_size
- static float DedupeMultiplier(std::size_t order);
-
- // How much memory vocabulary will use based on estimated size of the vocab.
- static std::size_t VocabUsage(std::size_t vocab_estimate);
-
- // token_count: out.
- // type_count aka vocabulary size. Initialize to an estimate. It is set to the exact value.
- CorpusCount(util::FilePiece &from, int vocab_write, uint64_t &token_count, WordIndex &type_count, std::vector<bool> &prune_words, const std::string& prune_vocab_filename, std::size_t entries_per_block, WarningAction disallowed_symbol);
-
- void Run(const util::stream::ChainPosition &position);
-
- private:
- util::FilePiece &from_;
- int vocab_write_;
- uint64_t &token_count_;
- WordIndex &type_count_;
- std::vector<bool>& prune_words_;
- const std::string& prune_vocab_filename_;
-
- std::size_t dedupe_mem_size_;
- util::scoped_malloc dedupe_mem_;
-
- WarningAction disallowed_symbol_action_;
-};
-
-} // namespace builder
-} // namespace lm
-#endif // LM_BUILDER_CORPUS_COUNT_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/corpus_count_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/corpus_count_test.cc b/ext/kenlm/lm/builder/corpus_count_test.cc
deleted file mode 100644
index 88bcf96..0000000
--- a/ext/kenlm/lm/builder/corpus_count_test.cc
+++ /dev/null
@@ -1,79 +0,0 @@
-#include "lm/builder/corpus_count.hh"
-
-#include "lm/builder/payload.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/common/ngram.hh"
-
-#include "util/file.hh"
-#include "util/file_piece.hh"
-#include "util/tokenize_piece.hh"
-#include "util/stream/chain.hh"
-#include "util/stream/stream.hh"
-
-#define BOOST_TEST_MODULE CorpusCountTest
-#include <boost/test/unit_test.hpp>
-
-namespace lm { namespace builder { namespace {
-
-#define Check(str, cnt) { \
- BOOST_REQUIRE(stream); \
- w = stream->begin(); \
- for (util::TokenIter<util::AnyCharacter, true> t(str, " "); t; ++t, ++w) { \
- BOOST_CHECK_EQUAL(*t, v[*w]); \
- } \
- BOOST_CHECK_EQUAL((uint64_t)cnt, stream->Value().count); \
- ++stream; \
-}
-
-BOOST_AUTO_TEST_CASE(Short) {
- util::scoped_fd input_file(util::MakeTemp("corpus_count_test_temp"));
- const char input[] = "looking on a little more loin\non a little more loin\non foo little more loin\nbar\n\n";
- // Blocks of 10 are
- // looking on a little more loin </s> on a little[duplicate] more[duplicate] loin[duplicate] </s>[duplicate] on[duplicate] foo
- // little more loin </s> bar </s> </s>
-
- util::WriteOrThrow(input_file.get(), input, sizeof(input) - 1);
- util::FilePiece input_piece(input_file.release(), "temp file");
-
- util::stream::ChainConfig config;
- config.entry_size = NGram<BuildingPayload>::TotalSize(3);
- config.total_memory = config.entry_size * 20;
- config.block_count = 2;
-
- util::scoped_fd vocab(util::MakeTemp("corpus_count_test_vocab"));
-
- util::stream::Chain chain(config);
- uint64_t token_count;
- WordIndex type_count = 10;
- std::vector<bool> prune_words;
- CorpusCount counter(input_piece, vocab.get(), token_count, type_count, prune_words, "", chain.BlockSize() / chain.EntrySize(), SILENT);
- chain >> boost::ref(counter);
- NGramStream<BuildingPayload> stream(chain.Add());
- chain >> util::stream::kRecycle;
-
- const char *v[] = {"<unk>", "<s>", "</s>", "looking", "on", "a", "little", "more", "loin", "foo", "bar"};
-
- WordIndex *w;
-
- Check("<s> <s> looking", 1);
- Check("<s> looking on", 1);
- Check("looking on a", 1);
- Check("on a little", 2);
- Check("a little more", 2);
- Check("little more loin", 2);
- Check("more loin </s>", 2);
- Check("<s> <s> on", 2);
- Check("<s> on a", 1);
- Check("<s> on foo", 1);
- Check("on foo little", 1);
- Check("foo little more", 1);
- Check("little more loin", 1);
- Check("more loin </s>", 1);
- Check("<s> <s> bar", 1);
- Check("<s> bar </s>", 1);
- Check("<s> <s> </s>", 1);
- BOOST_CHECK(!stream);
- BOOST_CHECK_EQUAL(sizeof(v) / sizeof(const char*), type_count);
-}
-
-}}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/debug_print.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/debug_print.hh b/ext/kenlm/lm/builder/debug_print.hh
deleted file mode 100644
index 4b9f306..0000000
--- a/ext/kenlm/lm/builder/debug_print.hh
+++ /dev/null
@@ -1,70 +0,0 @@
-#ifndef LM_BUILDER_DEBUG_PRINT_H
-#define LM_BUILDER_DEBUG_PRINT_H
-
-#include "lm/builder/payload.hh"
-#include "lm/common/print.hh"
-#include "lm/common/ngram_stream.hh"
-#include "util/file_stream.hh"
-#include "util/file.hh"
-
-#include <boost/lexical_cast.hpp>
-
-namespace lm { namespace builder {
-// Not defined, only specialized.
-template <class T> void PrintPayload(util::FileStream &to, const BuildingPayload &payload);
-template <> inline void PrintPayload<uint64_t>(util::FileStream &to, const BuildingPayload &payload) {
- to << payload.count;
-}
-template <> inline void PrintPayload<Uninterpolated>(util::FileStream &to, const BuildingPayload &payload) {
- to << log10(payload.uninterp.prob) << ' ' << log10(payload.uninterp.gamma);
-}
-template <> inline void PrintPayload<ProbBackoff>(util::FileStream &to, const BuildingPayload &payload) {
- to << payload.complete.prob << ' ' << payload.complete.backoff;
-}
-
-// template parameter is the type stored.
-template <class V> class Print {
- public:
- static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) {
- for (unsigned int i = 0; i < chains.size(); ++i) {
- std::string file(file_base + boost::lexical_cast<std::string>(i));
- chains[i] >> Print(vocab, util::CreateOrThrow(file.c_str()));
- }
- }
-
- explicit Print(const VocabReconstitute &vocab, int fd) : vocab_(vocab), to_(fd) {}
-
- void Run(const util::stream::ChainPositions &chains) {
- util::scoped_fd fd(to_);
- util::FileStream out(to_);
- NGramStreams<BuildingPayload> streams(chains);
- for (NGramStream<BuildingPayload> *s = streams.begin(); s != streams.end(); ++s) {
- DumpStream(*s, out);
- }
- }
-
- void Run(const util::stream::ChainPosition &position) {
- util::scoped_fd fd(to_);
- util::FileStream out(to_);
- NGramStream<BuildingPayload> stream(position);
- DumpStream(stream, out);
- }
-
- private:
- void DumpStream(NGramStream<BuildingPayload> &stream, util::FileStream &to) {
- for (; stream; ++stream) {
- PrintPayload<V>(to, stream->Value());
- for (const WordIndex *w = stream->begin(); w != stream->end(); ++w) {
- to << ' ' << vocab_.Lookup(*w) << '=' << *w;
- }
- to << '\n';
- }
- }
-
- const VocabReconstitute &vocab_;
- int to_;
-};
-
-}} // namespaces
-
-#endif // LM_BUILDER_DEBUG_PRINT_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/discount.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/discount.hh b/ext/kenlm/lm/builder/discount.hh
deleted file mode 100644
index e2f4084..0000000
--- a/ext/kenlm/lm/builder/discount.hh
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef LM_BUILDER_DISCOUNT_H
-#define LM_BUILDER_DISCOUNT_H
-
-#include <algorithm>
-
-#include <stdint.h>
-
-namespace lm {
-namespace builder {
-
-struct Discount {
- float amount[4];
-
- float Get(uint64_t count) const {
- return amount[std::min<uint64_t>(count, 3)];
- }
-
- float Apply(uint64_t count) const {
- return static_cast<float>(count) - Get(count);
- }
-};
-
-} // namespace builder
-} // namespace lm
-
-#endif // LM_BUILDER_DISCOUNT_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/dump_counts_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/dump_counts_main.cc b/ext/kenlm/lm/builder/dump_counts_main.cc
deleted file mode 100644
index 26078d0..0000000
--- a/ext/kenlm/lm/builder/dump_counts_main.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "lm/common/print.hh"
-#include "lm/word_index.hh"
-#include "util/file.hh"
-#include "util/read_compressed.hh"
-
-#include <boost/lexical_cast.hpp>
-
-#include <iostream>
-#include <vector>
-
-int main(int argc, char *argv[]) {
- if (argc != 4) {
- std::cerr << "Usage: " << argv[0] << " counts vocabulary order\n"
- "The counts file contains records with 4-byte vocabulary ids followed by 8-byte\n"
- "counts. Each record has order many vocabulary ids.\n"
- "The vocabulary file contains the words delimited by NULL in order of id.\n"
- "The vocabulary file may not be compressed because it is mmapped but the counts\n"
- "file can be compressed.\n";
- return 1;
- }
- util::ReadCompressed counts(util::OpenReadOrThrow(argv[1]));
- util::scoped_fd vocab_file(util::OpenReadOrThrow(argv[2]));
- lm::VocabReconstitute vocab(vocab_file.get());
- unsigned int order = boost::lexical_cast<unsigned int>(argv[3]);
- std::vector<char> record(sizeof(uint32_t) * order + sizeof(uint64_t));
- while (std::size_t got = counts.ReadOrEOF(&*record.begin(), record.size())) {
- UTIL_THROW_IF(got != record.size(), util::Exception, "Read " << got << " bytes at the end of file, which is not a complete record of length " << record.size());
- const lm::WordIndex *words = reinterpret_cast<const lm::WordIndex*>(&*record.begin());
- for (const lm::WordIndex *i = words; i != words + order; ++i) {
- UTIL_THROW_IF(*i >= vocab.Size(), util::Exception, "Vocab ID " << *i << " is larger than the vocab file's maximum of " << vocab.Size() << ". Are you sure you have the right order and vocab file for these counts?");
- std::cout << vocab.Lookup(*i) << ' ';
- }
- // TODO don't use std::cout because it is slow. Add fast uint64_t printing support to FileStream.
- std::cout << *reinterpret_cast<const uint64_t*>(words + order) << '\n';
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/hash_gamma.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/hash_gamma.hh b/ext/kenlm/lm/builder/hash_gamma.hh
deleted file mode 100644
index 4bef47e..0000000
--- a/ext/kenlm/lm/builder/hash_gamma.hh
+++ /dev/null
@@ -1,19 +0,0 @@
-#ifndef LM_BUILDER_HASH_GAMMA__
-#define LM_BUILDER_HASH_GAMMA__
-
-#include <stdint.h>
-
-namespace lm { namespace builder {
-
-#pragma pack(push)
-#pragma pack(4)
-
-struct HashGamma {
- uint64_t hash_value;
- float gamma;
-};
-
-#pragma pack(pop)
-
-}} // namespaces
-#endif // LM_BUILDER_HASH_GAMMA__
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/builder/header_info.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/builder/header_info.hh b/ext/kenlm/lm/builder/header_info.hh
deleted file mode 100644
index d01d049..0000000
--- a/ext/kenlm/lm/builder/header_info.hh
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef LM_BUILDER_HEADER_INFO_H
-#define LM_BUILDER_HEADER_INFO_H
-
-#include <string>
-#include <vector>
-#include <stdint.h>
-
-namespace lm { namespace builder {
-
-// Some configuration info that is used to add
-// comments to the beginning of an ARPA file
-struct HeaderInfo {
- std::string input_file;
- uint64_t token_count;
- std::vector<uint64_t> counts_pruned;
-
- HeaderInfo() {}
-
- HeaderInfo(const std::string& input_file_in, uint64_t token_count_in, const std::vector<uint64_t> &counts_pruned_in)
- : input_file(input_file_in), token_count(token_count_in), counts_pruned(counts_pruned_in) {}
-
- // TODO: Add smoothing type
- // TODO: More info if multiple models were interpolated
-};
-
-}} // namespaces
-
-#endif