You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:34:00 UTC
[12/51] [partial] incubator-joshua git commit: Converted KenLM into a
submodule
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/CMakeLists.txt b/ext/kenlm/lm/interpolate/CMakeLists.txt
deleted file mode 100644
index c146ab6..0000000
--- a/ext/kenlm/lm/interpolate/CMakeLists.txt
+++ /dev/null
@@ -1,54 +0,0 @@
-# Build rules for the KenLM interpolation sources in this directory.
-# Eigen3 is mandatory: REQUIRED makes configuration fail if it is not found.
-find_package(Eigen3 REQUIRED)
-include_directories(${EIGEN3_INCLUDE_DIR})
-
-# Sources compiled into the kenlm_interpolate object library below.
-set(KENLM_INTERPOLATE_SOURCE
- backoff_reunification.cc
- bounded_sequence_encoding.cc
- enumerate_global_vocab.cc
- merge_probabilities.cc
- merge_vocab.cc
- normalize.cc
- pipeline.cc
- split_worker.cc
- tune_derivatives.cc
- tune_instance.cc
- universal_vocab.cc)
-
-# OBJECT library: its object files are referenced through
-# $<TARGET_OBJECTS:kenlm_interpolate> in KENLM_INTERPOLATE_DEPENDS.
-add_library(kenlm_interpolate OBJECT ${KENLM_INTERPOLATE_SOURCE})
-
-# Executable names handed to AddExes.
-# NOTE(review): AddExes is defined outside this file; how a name maps to its
-# source file (presumably <name>_main.cc) should be confirmed there.
-set(KENLM_INTERPOLATE_EXES
- interpolate
- perf_enum_gv
- streaming_example
- train_params
- tune)
-
-# Object files shared by every executable and test declared in this file.
-set(KENLM_INTERPOLATE_DEPENDS
- $<TARGET_OBJECTS:kenlm>
- $<TARGET_OBJECTS:kenlm_util>
- $<TARGET_OBJECTS:kenlm_common>
- $<TARGET_OBJECTS:kenlm_interpolate>)
-
-AddExes(EXES ${KENLM_INTERPOLATE_EXES}
- DEPENDS ${KENLM_INTERPOLATE_DEPENDS}
- LIBRARIES ${Boost_LIBRARIES} pthread)
-
-# Tests are only declared when BUILD_TESTING is enabled.
-if(BUILD_TESTING)
- # Tests that are registered without extra command line arguments.
- set(KENLM_INTERPOLATE_TESTS
- backoff_reunification_test
- bounded_sequence_encoding_test
- merge_vocab_test
- normalize_test
- tune_derivatives_test)
-
- AddTests(TESTS ${KENLM_INTERPOLATE_TESTS}
- DEPENDS ${KENLM_INTERPOLATE_DEPENDS}
- LIBRARIES ${Boost_LIBRARIES} pthread)
-
- # tune_instance_test needs an extra command line parameter
- # (the toy0.1 path under tune_instance_data in this source directory).
- KenLMAddTest(TEST tune_instance_test
- DEPENDS ${KENLM_INTERPOLATE_DEPENDS}
- LIBRARIES ${Boost_LIBRARIES} pthread
- TEST_ARGS
- ${CMAKE_CURRENT_SOURCE_DIR}/tune_instance_data/toy0.1)
-endif()
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/Jamfile
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/Jamfile b/ext/kenlm/lm/interpolate/Jamfile
deleted file mode 100644
index 411a346..0000000
--- a/ext/kenlm/lm/interpolate/Jamfile
+++ /dev/null
@@ -1,22 +0,0 @@
-# Boost.Build rules for the interpolation code.
-# interp: every .cc in this directory except the *_main.cc, *_test.cc and
-# tune_*.cc files excluded by the glob, built with -fopenmp.
-# NOTE(review): fakelib is a project-defined rule; confirm its exact semantics.
-fakelib interp : ../common//common [ glob *.cc : *_main.cc *_test.cc tune_*.cc ] : <cxxflags>-fopenmp ;
-
-import testing ;
-
-# The tuning library, its tests and the tune executable are only declared
-# when the with-eigen option is set; its value is used as an include path.
-local with-eigen = [ option.get "with-eigen" ] ;
-if $(with-eigen) {
- fakelib tuning : tune_instance.cc tune_derivatives.cc interp ..//kenlm : <include>$(with-eigen) ;
- unit-test tune_derivatives_test : tune_derivatives_test.cc tuning /top//boost_unit_test_framework : <include>$(with-eigen) ;
-
- # Compiled separately so the run rule below can pass the toy0.1 data file.
- obj tune_instance_test.o : tune_instance_test.cc /top//boost_unit_test_framework : <include>$(with-eigen) ;
- run tune_instance_test.o tuning /top//boost_unit_test_framework : : tune_instance_data/toy0.1 ;
-
- exe tune : tune_main.cc tuning /top//boost_program_options : <include>$(with-eigen) ;
-}
-
-# Executables that do not need Eigen.
-exe interpolate : interpolate_main.cc interp /top//boost_program_options ;
-exe streaming_example : ../builder//builder interp streaming_example_main.cc /top//boost_program_options ;
-
-# Unit tests that do not need Eigen.
-unit-test normalize_test : interp normalize_test.cc /top//boost_unit_test_framework ;
-unit-test backoff_reunification_test : interp backoff_reunification_test.cc /top//boost_unit_test_framework ;
-unit-test bounded_sequence_encoding_test : interp bounded_sequence_encoding_test.cc /top//boost_unit_test_framework ;
-# merge_vocab_test is given the merge_test/* files as input files.
-run merge_vocab_test.cc interp /top//boost_unit_test_framework : : merge_test/test1 merge_test/test2 merge_test/test3 merge_test/test_bad_order merge_test/test_no_unk ;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_matrix.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/backoff_matrix.hh b/ext/kenlm/lm/interpolate/backoff_matrix.hh
deleted file mode 100644
index c7552df..0000000
--- a/ext/kenlm/lm/interpolate/backoff_matrix.hh
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef LM_INTERPOLATE_BACKOFF_MATRIX_H
-#define LM_INTERPOLATE_BACKOFF_MATRIX_H
-
-#include <cstddef>
-#include <vector>
-
-namespace lm { namespace interpolate {
-
-// Dense table of backoff values with one row per model and one column per
-// (order - 1), stored row-major in a single vector.
-class BackoffMatrix {
-  public:
-    // Allocates num_models * max_order floats, all value-initialised to 0.
-    BackoffMatrix(std::size_t num_models, std::size_t max_order)
-      : max_order_(max_order), backing_(num_models * max_order) {}
-
-    // Writable slot for the given model and zero-based order.
-    // No bounds checking is performed.
-    float &Backoff(std::size_t model, std::size_t order_minus_1) {
-      return backing_[Offset(model, order_minus_1)];
-    }
-
-    // Read-only copy of the value for the given model and zero-based order.
-    float Backoff(std::size_t model, std::size_t order_minus_1) const {
-      return backing_[Offset(model, order_minus_1)];
-    }
-
-  private:
-    // Row-major position of (model, order_minus_1) inside backing_.
-    std::size_t Offset(std::size_t model, std::size_t order_minus_1) const {
-      return model * max_order_ + order_minus_1;
-    }
-
-    const std::size_t max_order_;
-    std::vector<float> backing_;
-};
-
-}} // namespaces
-
-#endif // LM_INTERPOLATE_BACKOFF_MATRIX_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/backoff_reunification.cc b/ext/kenlm/lm/interpolate/backoff_reunification.cc
deleted file mode 100644
index edf6ddc..0000000
--- a/ext/kenlm/lm/interpolate/backoff_reunification.cc
+++ /dev/null
@@ -1,57 +0,0 @@
-#include "lm/interpolate/backoff_reunification.hh"
-#include "lm/common/model_buffer.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/common/ngram.hh"
-#include "lm/common/compare.hh"
-
-#include <cassert>
-
-namespace lm {
-namespace interpolate {
-
-namespace {
-// Chain worker that zips a probability stream with a backoff stream and
-// writes the combined ProbBackoff records to the chain it is attached to.
-class MergeWorker {
-public:
-  // Stores copies of the two input positions; they are only opened in Run().
-  MergeWorker(std::size_t order, const util::stream::ChainPosition &prob_pos,
-              const util::stream::ChainPosition &boff_pos)
-      : order_(order), prob_pos_(prob_pos), boff_pos_(boff_pos) {}
-
-  // Copies the word ids and probability from each probability record, pairs
-  // them with the float read from the backoff stream, and poisons the output
-  // once both inputs are exhausted.  Throws if only one input ran out.
-  void Run(const util::stream::ChainPosition &position) {
-    lm::NGramStream<ProbBackoff> merged(position);
-    lm::NGramStream<float> probs(prob_pos_);
-    util::stream::Stream backoffs(boff_pos_);
-
-    while (probs && backoffs) {
-      std::copy(probs->begin(), probs->end(), merged->begin());
-      merged->Value().prob = probs->Value();
-      merged->Value().backoff = *reinterpret_cast<float *>(backoffs.Get());
-      ++probs;
-      ++backoffs;
-      ++merged;
-    }
-    UTIL_THROW_IF2(probs || backoffs,
-                   "Streams were not the same size during merging");
-    merged.Poison();
-  }
-
-private:
-  std::size_t order_;
-  util::stream::ChainPosition prob_pos_;
-  util::stream::ChainPosition boff_pos_;
-};
-}
-
-// Attaches one MergeWorker per order to the output chains.
-//
-// Since we are *adding* something to the output chain here, we pass in the
-// chain itself so that we can safely add a new step to the chain without
-// creating a deadlock situation (since creating a new ChainPosition will
-// make a new input/output pair---we want that position to be created
-// *here*, not before).
-//
-// prob_pos[i] and boff_pos[i] feed output_chains[i], which carries order i + 1.
-// Throws if the two position lists differ in length.
-void ReunifyBackoff(util::stream::ChainPositions &prob_pos,
-                    util::stream::ChainPositions &boff_pos,
-                    util::stream::Chains &output_chains) {
-  // Checked in every build type: an assert() is compiled out under NDEBUG,
-  // which would let the loop below index boff_pos out of range.
-  UTIL_THROW_IF2(prob_pos.size() != boff_pos.size(),
-                 "Expected one backoff stream per probability stream, got "
-                     << boff_pos.size() << " backoff streams for "
-                     << prob_pos.size() << " probability streams");
-
-  for (std::size_t i = 0; i < prob_pos.size(); ++i)
-    output_chains[i] >> MergeWorker(i + 1, prob_pos[i], boff_pos[i]);
-}
-}
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/backoff_reunification.hh b/ext/kenlm/lm/interpolate/backoff_reunification.hh
deleted file mode 100644
index 327db65..0000000
--- a/ext/kenlm/lm/interpolate/backoff_reunification.hh
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_
-#define KENLM_INTERPOLATE_BACKOFF_REUNIFICATION_
-
-#include "util/stream/stream.hh"
-#include "util/stream/multi_stream.hh"
-
-namespace lm {
-namespace interpolate {
-
-/**
- * The third pass for the offline log-linear interpolation algorithm. This
- * reads **suffix-ordered** probability values (ngram-id, float) and
- * **suffix-ordered** backoff values (float) and writes the merged contents
- * to the output.
- *
- * Entry i of each argument corresponds to n-gram order i + 1. prob_pos and
- * boff_pos must contain the same number of positions, and for every order
- * the probability and backoff streams must hold the same number of records.
- *
- * @param prob_pos The chain position for each order from which to read
- * the probability values
- * @param boff_pos The chain position for each order from which to read
- * the backoff values
- * @param output_chains The output chains for each order; a merging step is
- * appended to each of them
- */
-void ReunifyBackoff(util::stream::ChainPositions &prob_pos,
- util::stream::ChainPositions &boff_pos,
- util::stream::Chains &output_chains);
-}
-}
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/backoff_reunification_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/backoff_reunification_test.cc b/ext/kenlm/lm/interpolate/backoff_reunification_test.cc
deleted file mode 100644
index d8faf94..0000000
--- a/ext/kenlm/lm/interpolate/backoff_reunification_test.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "lm/interpolate/backoff_reunification.hh"
-#include "lm/common/ngram_stream.hh"
-
-#define BOOST_TEST_MODULE InterpolateBackoffReunificationTest
-#include <boost/test/unit_test.hpp>
-
-namespace lm {
-namespace interpolate {
-
-namespace {
-
-// none of this input actually makes sense, all we care about is making
-// sure the merging works
-// One expected record of order N: its word ids, probability and backoff.
-template <uint8_t N>
-struct Gram {
- WordIndex ids[N];
- float prob;
- float boff;
-};
-
-// Table of expected records for order N; only the explicit specialisations
-// below (N = 1, 2, 3) provide data.
-template <uint8_t N>
-struct Grams {
- const static Gram<N> grams[];
-};
-
-// Unigram fixture: {ids, prob, boff}.
-template <>
-const Gram<1> Grams<1>::grams[]
- = {{{0}, 0.1f, 0.1f}, {{1}, 0.4f, 0.2f}, {{2}, 0.5f, 0.1f}};
-
-// Bigram fixture.
-template <>
-const Gram<2> Grams<2>::grams[] = {{{0, 0}, 0.05f, 0.05f},
- {{1, 0}, 0.05f, 0.02f},
- {{1, 1}, 0.2f, 0.04f},
- {{2, 2}, 0.2f, 0.01f}};
-
-// Trigram fixture.
-template <>
-const Gram<3> Grams<3>::grams[] = {{{0, 0, 0}, 0.001f, 0.005f},
- {{1, 0, 0}, 0.001f, 0.002f},
- {{2, 0, 0}, 0.001f, 0.003f},
- {{0, 1, 0}, 0.1f, 0.008f},
- {{1, 1, 0}, 0.1f, 0.09f},
- {{1, 1, 1}, 0.2f, 0.08f}};
-
-// Producer that writes the word ids and probability of every Grams<N> row
-// to an NGramStream<float>, then poisons the stream.
-template <uint8_t N>
-class WriteInput {
-public:
-  void Run(const util::stream::ChainPosition &position) {
-    lm::NGramStream<float> output(position);
-
-    const std::size_t row_count = sizeof(Grams<N>::grams) / sizeof(Gram<N>);
-    for (std::size_t row = 0; row < row_count; ++row) {
-      const Gram<N> &gram = Grams<N>::grams[row];
-      std::copy(gram.ids, gram.ids + N, output->begin());
-      output->Value() = gram.prob;
-      ++output;
-    }
-    output.Poison();
-  }
-};
-
-// Producer that writes the backoff of every Grams<N> row as a bare float
-// record, then poisons the stream.
-template <uint8_t N>
-class WriteBackoffs {
-public:
-  void Run(const util::stream::ChainPosition &position) {
-    util::stream::Stream output(position);
-
-    const std::size_t row_count = sizeof(Grams<N>::grams) / sizeof(Gram<N>);
-    for (std::size_t row = 0; row < row_count; ++row) {
-      float *slot = reinterpret_cast<float *>(output.Get());
-      *slot = Grams<N>::grams[row].boff;
-      ++output;
-    }
-    output.Poison();
-  }
-};
-
-// Consumer that compares every merged record against the Grams<N> table and
-// throws on the first mismatch in ids, probability, backoff or record count.
-template <uint8_t N>
-class CheckOutput {
-public:
-  void Run(const util::stream::ChainPosition &position) {
-    lm::NGramStream<ProbBackoff> stream(position);
-    const std::size_t expected_count = sizeof(Grams<N>::grams) / sizeof(Gram<N>);
-
-    std::size_t i = 0;
-    for (; stream; ++stream, ++i) {
-      std::stringstream ss;
-      for (WordIndex *idx = stream->begin(); idx != stream->end(); ++idx)
-        ss << "(" << *idx << ")";
-
-      // Must be checked before indexing the table: a stream with surplus
-      // records would otherwise read past the end of Grams<N>::grams.
-      UTIL_THROW_IF2(i >= expected_count,
-                     "Got more than the expected " << expected_count << " "
-                         << (int)N << "-grams; extra entry: " << ss.str());
-      const Gram<N> &expected = Grams<N>::grams[i];
-
-      UTIL_THROW_IF2(
-          !std::equal(stream->begin(), stream->end(), expected.ids),
-          "Mismatched id in CheckOutput<" << (int)N << ">: " << ss.str());
-
-      // Exact float comparison is intended: the values are copied, not computed.
-      UTIL_THROW_IF2(stream->Value().prob != expected.prob,
-                     "Mismatched probability in CheckOutput<"
-                         << (int)N << ">, got " << stream->Value().prob
-                         << ", expected " << expected.prob);
-
-      UTIL_THROW_IF2(stream->Value().backoff != expected.boff,
-                     "Mismatched backoff in CheckOutput<"
-                         << (int)N << ">, got " << stream->Value().backoff
-                         << ", expected " << expected.boff);
-    }
-    UTIL_THROW_IF2(i != expected_count,
-                   "Did not get correct number of "
-                       << (int)N << "-grams: expected "
-                       << expected_count
-                       << ", got " << i);
-  }
-};
-}
-
-// End-to-end check of ReunifyBackoff for orders 1 to 3: producers write the
-// fixture probabilities and backoffs, ReunifyBackoff merges them, and
-// CheckOutput verifies the merged records.
-BOOST_AUTO_TEST_CASE(BackoffReunificationTest) {
- // Shared chain settings; only entry_size is changed between chains.
- util::stream::ChainConfig config;
- config.total_memory = 100;
- config.block_count = 1;
-
- // One probability chain per order, each fed by WriteInput<order>.
- util::stream::Chains prob_chains(3);
- config.entry_size = NGram<float>::TotalSize(1);
- prob_chains.push_back(config);
- prob_chains.back() >> WriteInput<1>();
-
- config.entry_size = NGram<float>::TotalSize(2);
- prob_chains.push_back(config);
- prob_chains.back() >> WriteInput<2>();
-
- config.entry_size = NGram<float>::TotalSize(3);
- prob_chains.push_back(config);
- prob_chains.back() >> WriteInput<3>();
-
- // One backoff chain per order; every record is a single float.
- util::stream::Chains boff_chains(3);
- config.entry_size = sizeof(float);
- boff_chains.push_back(config);
- boff_chains.back() >> WriteBackoffs<1>();
-
- boff_chains.push_back(config);
- boff_chains.back() >> WriteBackoffs<2>();
-
- boff_chains.push_back(config);
- boff_chains.back() >> WriteBackoffs<3>();
-
- // Positions from which the merge workers will read the two inputs.
- util::stream::ChainPositions prob_pos(prob_chains);
- util::stream::ChainPositions boff_pos(boff_chains);
-
- // Output chains sized for full ProbBackoff records of each order.
- util::stream::Chains output_chains(3);
- for (std::size_t i = 0; i < 3; ++i) {
- config.entry_size = NGram<ProbBackoff>::TotalSize(i + 1);
- output_chains.push_back(config);
- }
-
- // Adds the merging step to each output chain.
- ReunifyBackoff(prob_pos, boff_pos, output_chains);
-
- // The checkers are attached after the merge step, so they see merged records.
- output_chains[0] >> CheckOutput<1>();
- output_chains[1] >> CheckOutput<2>();
- output_chains[2] >> CheckOutput<3>();
-
- // NOTE(review): kRecycle presumably terminates the input chains so their
- // blocks are reused -- confirm against util/stream.
- prob_chains >> util::stream::kRecycle;
- boff_chains >> util::stream::kRecycle;
-
- output_chains.Wait();
-}
-}
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc
deleted file mode 100644
index aca8ed7..0000000
--- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.cc
+++ /dev/null
@@ -1,36 +0,0 @@
-#include "lm/interpolate/bounded_sequence_encoding.hh"
-
-#include <algorithm>
-
-namespace lm { namespace interpolate {
-
-// Lays out one bit field per bound, packing fields into consecutive 64-bit
-// words.  A field never straddles two words: when it does not fit in the
-// remaining bits it starts a new word and is flagged with next = true.
-//
-// bound_begin/bound_end delimit the exclusive upper bounds of the sequence.
-// A bound of 0 or 1 gets a zero-width field.  Other widths are the bit width
-// of the bound itself (so a power-of-two bound uses one more bit than the
-// largest legal value needs); this is kept so the encoded size is unchanged.
-BoundedSequenceEncoding::BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end)
-  : entries_(bound_end - bound_begin) {
-  std::size_t full = 0;  // completed 64-bit words preceding the current one
-  uint8_t used = 0;      // bits already assigned in the current word
-  for (const unsigned char *i = bound_begin; i != bound_end; ++i) {
-    uint8_t length;
-    if (*i <= 1) {
-      length = 0;
-    } else {
-      length = sizeof(unsigned int) * 8 - __builtin_clz((unsigned int)*i);
-    }
-    Entry entry;
-    entry.mask = (1ULL << length) - 1ULL;
-    if (used + length > 64) {
-      used = 0;
-      entry.next = true;
-      ++full;
-    } else {
-      entry.next = false;
-    }
-    // A zero-width field that follows exactly 64 used bits would otherwise
-    // record shift == 64, and shifting a uint64_t by 64 in Encode/Decode is
-    // undefined behaviour.  Its mask is 0, so any in-range shift is correct.
-    entry.shift = (length == 0) ? 0 : used;
-    entries_.push_back(entry);
-    used += length;
-  }
-  byte_length_ = full * sizeof(uint64_t) + (used + 7) / 8;
-  first_copy_ = std::min<std::size_t>(byte_length_, sizeof(uint64_t));
-  // Size of last uint64_t. Zero if empty, otherwise [1,8] depending on mod.
-  overhang_ = byte_length_ == 0 ? 0 : ((byte_length_ - 1) % 8 + 1);
-}
-
-}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh b/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh
deleted file mode 100644
index 84dd63a..0000000
--- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding.hh
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H
-#define LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H
-
-/* Encodes fixed-length sequences of integers with known bounds on each entry.
- * This is used to encode how far each model has backed off.
- * TODO: make this class efficient. Bit-level packing or multiply by bound and
- * add.
- */
-
-#include "util/exception.hh"
-#include "util/fixed_array.hh"
-
-#if BYTE_ORDER != LITTLE_ENDIAN
-#warning The interpolation code assumes little endian for now.
-#endif
-
-#include <algorithm>
-#include <cstring>
-
-namespace lm {
-namespace interpolate {
-
-// Packs a fixed-length sequence of small bounded integers into bit fields
-// stored in consecutive 64-bit words (copied byte-wise, hence the
-// little-endian assumption noted at the top of this header).
-class BoundedSequenceEncoding {
-  public:
-    // Encode [0, bound_begin[0]) x [0, bound_begin[1]) x [0, bound_begin[2]) x ... x [0, *(bound_end - 1)) for entries in the sequence
-    BoundedSequenceEncoding(const unsigned char *bound_begin, const unsigned char *bound_end);
-
-    // Number of values in one sequence.
-    std::size_t Entries() const { return entries_.size(); }
-
-    // Number of bytes written by Encode and read by Decode.
-    std::size_t EncodedLength() const { return byte_length_; }
-
-    // Packs Entries() values read from `from` into EncodedLength() bytes at
-    // to_void.  Values are not range-checked against their bounds.
-    void Encode(const unsigned char *from, void *to_void) const {
-      uint8_t *to = static_cast<uint8_t*>(to_void);
-      uint64_t cur = 0;
-      for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++from) {
-        if (UTIL_UNLIKELY(i->next)) {
-          // Current word is complete: flush it and start the next one.
-          std::memcpy(to, &cur, sizeof(uint64_t));
-          to += sizeof(uint64_t);
-          cur = 0;
-        }
-        cur |= static_cast<uint64_t>(*from) << i->shift;
-      }
-      // Only the used bytes of the last word are written.
-      std::memcpy(to, &cur, overhang_);
-    }
-
-    // Unpacks EncodedLength() bytes at from_void into Entries() values at `to`.
-    void Decode(const void *from_void, unsigned char *to) const {
-      const uint8_t *from = static_cast<const uint8_t*>(from_void);
-      uint64_t cur = 0;
-      std::memcpy(&cur, from, first_copy_);
-      for (const Entry *i = entries_.begin(); i != entries_.end(); ++i, ++to) {
-        if (UTIL_UNLIKELY(i->next)) {
-          from += sizeof(uint64_t);
-          cur = 0;
-          // Never read past the encoded data: the last word may be short.
-          std::memcpy(&cur, from,
-              std::min<std::size_t>(sizeof(uint64_t), static_cast<const uint8_t*>(from_void) + byte_length_ - from));
-        }
-        *to = (cur >> i->shift) & i->mask;
-      }
-    }
-
-  private:
-    struct Entry {
-      bool next;      // true when this field starts a new 64-bit word
-      uint8_t shift;  // bit offset of the field inside its word
-      uint64_t mask;  // low-bit mask as wide as the field
-    };
-    util::FixedArray<Entry> entries_;
-    std::size_t byte_length_;  // total encoded size in bytes
-    std::size_t first_copy_;   // bytes Decode loads for the first word
-    std::size_t overhang_;     // bytes Encode stores for the last word
-};
-
-
-}} // namespaces
-
-#endif // LM_INTERPOLATE_BOUNDED_SEQUENCE_ENCODING_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc b/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc
deleted file mode 100644
index 2c4bbd9..0000000
--- a/ext/kenlm/lm/interpolate/bounded_sequence_encoding_test.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "lm/interpolate/bounded_sequence_encoding.hh"
-
-#include "util/scoped.hh"
-
-#define BOOST_TEST_MODULE BoundedSequenceEncodingTest
-#include <boost/test/unit_test.hpp>
-
-namespace lm {
-namespace interpolate {
-namespace {
-
-// Round-trips every value combination permitted by the bounds through
-// Encode/Decode, advancing the input like an odometer whose digit i wraps
-// at bound_begin[i].
-void ExhaustiveTest(unsigned char *bound_begin, unsigned char *bound_end) {
-  BoundedSequenceEncoding enc(bound_begin, bound_end);
-  util::scoped_malloc backing(util::MallocOrThrow(enc.EncodedLength()));
-  const std::size_t length = bound_end - bound_begin;
-  std::vector<unsigned char> values(length), out(length);
-  for (;;) {
-    enc.Encode(&values[0], backing.get());
-    enc.Decode(backing.get(), &out[0]);
-    for (std::size_t i = 0; i != values.size(); ++i) {
-      BOOST_CHECK_EQUAL(values[i], out[i]);
-    }
-    // Increment the lowest digit, carrying into the next while digits wrap.
-    std::size_t digit = 0;
-    while (digit != values.size()) {
-      if (++values[digit] < bound_begin[digit]) break;
-      values[digit] = 0;
-      ++digit;
-    }
-    // Carry out of the last digit: every combination has been visited.
-    if (digit == values.size()) return;
-  }
-}
-
-// Encodes `input` with an encoder built from `bounds`, decodes the result
-// into `output`, and checks that all `len` values survive the round trip.
-void CheckEncodeDecode(unsigned char *bounds, unsigned char *input,
-                       unsigned char *output, std::size_t len) {
-  unsigned char *const bounds_end = bounds + len;
-  BoundedSequenceEncoding codec(bounds, bounds_end);
-  util::scoped_malloc buffer(util::MallocOrThrow(codec.EncodedLength()));
-
-  codec.Encode(input, buffer.get());
-  codec.Decode(buffer.get(), output);
-
-  for (std::size_t i = 0; i != len; ++i) {
-    BOOST_CHECK_EQUAL(input[i], output[i]);
-  }
-}
-
-// Exhaustively round-trips a seven-entry sequence with mixed small bounds.
-BOOST_AUTO_TEST_CASE(Exhaustive) {
-  unsigned char bounds[] = {5, 2, 3, 9, 7, 20, 8};
-  // sizeof(unsigned char) is 1, so sizeof(bounds) is the element count.
-  unsigned char *const bounds_end = bounds + sizeof(bounds);
-  ExhaustiveTest(bounds, bounds_end);
-}
-
-// Round trip where all fields fit in one 64-bit word with bits to spare
-// (seven bounds of 255 and one bound of 3).
-BOOST_AUTO_TEST_CASE(LessThan64) {
-  unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 3};
-  unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 2};
-  unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0};
-
-  std::size_t len = sizeof(bounds) / sizeof(unsigned char);
-  // Checked through the test framework rather than assert(), which is
-  // compiled out under NDEBUG.
-  BOOST_REQUIRE_EQUAL(sizeof(input) / sizeof(unsigned char), len);
-  BOOST_REQUIRE_EQUAL(sizeof(output) / sizeof(unsigned char), len);
-
-  CheckEncodeDecode(bounds, input, output, len);
-}
-
-// Round trip where eight 8-bit fields fill one 64-bit word exactly.
-BOOST_AUTO_TEST_CASE(Exactly64) {
-  unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255};
-  unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16};
-  unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0};
-
-  std::size_t len = sizeof(bounds) / sizeof(unsigned char);
-  // Checked through the test framework rather than assert(), which is
-  // compiled out under NDEBUG.
-  BOOST_REQUIRE_EQUAL(sizeof(input) / sizeof(unsigned char), len);
-  BOOST_REQUIRE_EQUAL(sizeof(output) / sizeof(unsigned char), len);
-
-  CheckEncodeDecode(bounds, input, output, len);
-}
-
-// Round trip where nine 8-bit fields spill into a second 64-bit word.
-BOOST_AUTO_TEST_CASE(MoreThan64) {
-  unsigned char bounds[] = {255, 255, 255, 255, 255, 255, 255, 255, 255};
-  unsigned char input[] = {172, 183, 254, 187, 96, 87, 65, 16, 137};
-  unsigned char output[] = {0, 0, 0, 0, 0, 0, 0, 0, 0};
-
-  std::size_t len = sizeof(bounds) / sizeof(unsigned char);
-  // Checked through the test framework rather than assert(), which is
-  // compiled out under NDEBUG.
-  BOOST_REQUIRE_EQUAL(sizeof(input) / sizeof(unsigned char), len);
-  BOOST_REQUIRE_EQUAL(sizeof(output) / sizeof(unsigned char), len);
-
-  CheckEncodeDecode(bounds, input, output, len);
-}
-
-}}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc b/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc
deleted file mode 100644
index 26f63c8..0000000
--- a/ext/kenlm/lm/interpolate/enumerate_global_vocab.cc
+++ /dev/null
@@ -1,48 +0,0 @@
-
-#include "lm/interpolate/enumerate_global_vocab.hh"
-
-#include <iostream>
-#include <map>
-
-namespace lm {
-//constructor
-
-  // Binds the builder to the caller-owned map `vm` and records the number of
-  // models `nm`; the current model and word counter both start at 0.
-  EnumerateGlobalVocab::EnumerateGlobalVocab(std::map<std::string, int*> * vm, int nm)
-    : vmap(vm), num_models(nm), cur_model(0), cnt(0) {
-    std::cerr << "Vocab Builder with models: " << nm << std::endl;
-  }
-
-  // Records that word `str` has id `index` in the current model.
-  // A word seen before has its slot for cur_model overwritten; a new word
-  // gets a fresh array of num_models ids, all 0 (0 is UNK) except cur_model.
-  // The arrays are allocated with new[] and stored in *vmap; nothing in this
-  // class frees them.
-  void EnumerateGlobalVocab::Add(WordIndex index, const StringPiece &str) {
-
-    const std::string st = str.as_string();
-
-    //check for existence of key
-    std::map<std::string, int*>::iterator itr = vmap->find(st);
-
-    //known word: just fill in this model's id
-    if(itr != vmap->end()) {
-      std::cerr << "Vocab exist: " << str << " M: " << cur_model << " I:" << index << std::endl;
-      itr->second[cur_model] = index;
-      return;
-    }
-
-    //new key: create model index map for this vocab word.
-    //The trailing () value-initialises every element to 0, which replaces a
-    //memset call that relied on <cstring> without including it.
-    int * indices = new int[num_models]();
-
-    indices[cur_model] = index;
-    (*vmap)[st] = indices;
-    std::cerr << cnt << ":Vocab add: " << str << " M: " << cur_model << " I:" << index << std::endl;
-    cnt++;
-  }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh b/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh
deleted file mode 100644
index c37d649..0000000
--- a/ext/kenlm/lm/interpolate/enumerate_global_vocab.hh
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef LM_ENUMERATE_GLOBAL_VOCAB_H
-#define LM_ENUMERATE_GLOBAL_VOCAB_H
-
-#include "lm/enumerate_vocab.hh"
-#include <map>
-
-/* Use this to create a global vocab across models for use when
- * calculating lambdas for interpolation. Or other stuff.
- */
-namespace lm {
-
-  // Vocabulary callback that collects, for every word, its id in each model.
-  // Inheritance is public so an instance converts to EnumerateVocab* without
-  // a cast; with the previous implicit private inheritance that conversion
-  // was inaccessible outside the class.
-  class EnumerateGlobalVocab : public EnumerateVocab {
-
-    public:
-
-      // The members are public and accessed directly by callers.
-      std::map<std::string, int*> * vmap;  // word -> array of num_models ids; map not owned
-      int num_models;                      // length of every id array
-      int cur_model;                       // slot written by Add
-      int cnt;                             // number of distinct words added so far
-
-      // Does not free the id arrays stored in *vmap.
-      ~EnumerateGlobalVocab() {}
-
-      void Add(WordIndex index, const StringPiece & str);
-
-      EnumerateGlobalVocab(std::map<std::string, int*> *, int);
-
-      // Selects which model's slot subsequent Add calls fill in.
-      void SetCurModel(int i) { cur_model = i; }
-
-    protected:
-      // Leaves the object unbound (null map, zero models) instead of
-      // leaving every member uninitialised.
-      EnumerateGlobalVocab() : vmap(0), num_models(0), cur_model(0), cnt(0) {}
-
-  };
-
-} //namespace lm
-
-#endif // LM_ENUMERATE_GLOBAL_VOCAB_H
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/interpolate_info.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/interpolate_info.hh b/ext/kenlm/lm/interpolate/interpolate_info.hh
deleted file mode 100644
index ebecd92..0000000
--- a/ext/kenlm/lm/interpolate/interpolate_info.hh
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef KENLM_INTERPOLATE_INTERPOLATE_INFO_H
-#define KENLM_INTERPOLATE_INTERPOLATE_INFO_H
-
-#include <cstddef>
-#include <vector>
-#include <stdint.h>
-
-namespace lm {
-namespace interpolate {
-
-/**
- * Per-model settings shared by the passes of the offline log-linear
- * interpolation algorithm: one weight and one maximum order per model.
- */
-struct InterpolateInfo {
-  /**
-   * @return how many models are being interpolated, taken from the number
-   * of entries in orders
-   */
-  std::size_t Models() const {
-    const std::size_t model_count = orders.size();
-    return model_count;
-  }
-
-  /**
-   * Interpolation weight (lambda) of each model.
-   */
-  std::vector<float> lambdas;
-
-  /**
-   * Highest n-gram order of each model.
-   */
-  std::vector<uint8_t> orders;
-};
-}
-}
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/interpolate_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/interpolate_main.cc b/ext/kenlm/lm/interpolate/interpolate_main.cc
deleted file mode 100644
index a99b62d..0000000
--- a/ext/kenlm/lm/interpolate/interpolate_main.cc
+++ /dev/null
@@ -1,37 +0,0 @@
-#include "lm/common/model_buffer.hh"
-#include "lm/common/size_option.hh"
-#include "lm/interpolate/pipeline.hh"
-#include "util/fixed_array.hh"
-#include "util/usage.hh"
-
-#include <boost/program_options.hpp>
-
-#include <iostream>
-#include <vector>
-
-// Command-line driver for log-linear interpolation: parses the weights,
-// model paths and sorting options, opens one ModelBuffer per model and runs
-// the interpolation pipeline.  Returns 1 when the number of weights does not
-// match the number of models.
-int main(int argc, char *argv[]) {
-  lm::interpolate::Config config;
-  std::vector<std::string> input_models;
-  namespace po = boost::program_options;
-  po::options_description options("Log-linear interpolation options");
-  options.add_options()
-    ("lambda,w", po::value<std::vector<float> >(&config.lambdas)->multitoken()->required(), "Interpolation weights")
-    ("model,m", po::value<std::vector<std::string> >(&input_models)->multitoken()->required(), "Models to interpolate")
-    ("temp_prefix,T", po::value<std::string>(&config.sort.temp_prefix)->default_value("/tmp/lm"), "Temporary file prefix")
-    ("memory,S", lm::SizeOption(config.sort.total_memory, util::GuessPhysicalMemory() ? "50%" : "1G"), "Sorting memory")
-    ("sort_block", lm::SizeOption(config.sort.buffer_size, "64M"), "Block size");
-  po::variables_map vm;
-  po::store(po::parse_command_line(argc, argv, options), vm);
-  // notify() enforces the required() options and fills the bound variables.
-  po::notify(vm);
-
-  if (config.lambdas.size() != input_models.size()) {
-    // A space now separates "weights" from the count that follows it.
-    std::cerr << "Number of models " << input_models.size() << " should match the number of weights " << config.lambdas.size() << "." << std::endl;
-    return 1;
-  }
-
-  // FixedArray is sized up front, then filled with one buffer per model path.
-  util::FixedArray<lm::ModelBuffer> models(input_models.size());
-  for (std::size_t i = 0; i < input_models.size(); ++i) {
-    models.push_back(input_models[i]);
-  }
-  lm::interpolate::Pipeline(models, config, 1);
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_probabilities.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_probabilities.cc b/ext/kenlm/lm/interpolate/merge_probabilities.cc
deleted file mode 100644
index b6c949f..0000000
--- a/ext/kenlm/lm/interpolate/merge_probabilities.cc
+++ /dev/null
@@ -1,285 +0,0 @@
-#include "lm/interpolate/merge_probabilities.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/interpolate/bounded_sequence_encoding.hh"
-#include "lm/interpolate/interpolate_info.hh"
-
-#include <algorithm>
-#include <limits>
-#include <numeric>
-
-namespace lm {
-namespace interpolate {
-
-/**
- * Helper to generate the BoundedSequenceEncoding used for writing the
- * from values.
- */
-BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order) {
- util::FixedArray<uint8_t> max_orders(info.orders.size());
- for (std::size_t i = 0; i < info.orders.size(); ++i) {
- max_orders.push_back(std::min(order, info.orders[i]));
- }
- return BoundedSequenceEncoding(max_orders.begin(), max_orders.end());
-}
-
-namespace {
-
-/**
- * A simple wrapper class that holds information needed to read and write
- * the ngrams of a particular order. This class has the memory needed to
- * buffer the data needed for the recursive process of computing the
- * probabilities and "from" values for each component model.
- *
- * "From" values indicate, for each model, what order (as an index, so -1)
- * was backed off to in order to arrive at a probability. For example, if a
- * 5-gram model (order index 4) backed off twice, we would write a 2.
- */
-class NGramHandler {
-public:
- NGramHandler(uint8_t order, const InterpolateInfo &ifo,
- util::FixedArray<util::stream::ChainPositions> &models_by_order)
- : info(ifo),
- encoder(MakeEncoder(info, order)),
- out_record(order, encoder.EncodedLength()) {
- std::size_t count_has_order = 0;
- for (std::size_t i = 0; i < models_by_order.size(); ++i) {
- count_has_order += (models_by_order[i].size() >= order);
- }
- inputs_.Init(count_has_order);
- for (std::size_t i = 0; i < models_by_order.size(); ++i) {
- if (models_by_order[i].size() < order)
- continue;
- inputs_.push_back(models_by_order[i][order - 1]);
- if (inputs_.back()) {
- active_.resize(active_.size() + 1);
- active_.back().model = i;
- active_.back().stream = &inputs_.back();
- }
- }
-
- // have to init outside since NGramStreams doesn't forward to
- // GenericStreams ctor given a ChainPositions
-
- probs.Init(info.Models());
- from.Init(info.Models());
- for (std::size_t i = 0; i < info.Models(); ++i) {
- probs.push_back(0.0);
- from.push_back(0);
- }
- }
-
- struct StreamIndex {
- NGramStream<ProbBackoff> *stream;
- NGramStream<ProbBackoff> &Stream() { return *stream; }
- std::size_t model;
- };
-
- std::size_t ActiveSize() const {
- return active_.size();
- }
-
- /**
- * @return the input stream for a particular model that corresponds to
- * this ngram order
- */
- StreamIndex &operator[](std::size_t idx) {
- return active_[idx];
- }
-
- void erase(std::size_t idx) {
- active_.erase(active_.begin() + idx);
- }
-
- const InterpolateInfo &info;
- BoundedSequenceEncoding encoder;
- PartialProbGamma out_record;
- util::FixedArray<float> probs;
- util::FixedArray<uint8_t> from;
-
-private:
- std::vector<StreamIndex> active_;
- NGramStreams<ProbBackoff> inputs_;
-};
-
-/**
- * A collection of NGramHandlers.
- */
-class NGramHandlers : public util::FixedArray<NGramHandler> {
-public:
- explicit NGramHandlers(std::size_t num)
- : util::FixedArray<NGramHandler>(num) {
- }
-
- void push_back(
- std::size_t order, const InterpolateInfo &info,
- util::FixedArray<util::stream::ChainPositions> &models_by_order) {
- new (end()) NGramHandler(order, info, models_by_order);
- Constructed();
- }
-};
-
-/**
- * The recursive helper function that computes probability and "from"
- * values for all ngrams matching a particular suffix.
- *
- * The current order can be computed as the suffix length + 1. Note that
- * the suffix could be empty (suffix_begin == suffix_end == NULL), in which
- * case we are handling unigrams with the UNK token as the fallback
- * probability.
- *
- * @param handlers The full collection of handlers
- * @param suffix_begin A start iterator for the suffix
- * @param suffix_end An end iterator for the suffix
- * @param fallback_probs The probabilities of this ngram if we need to
- * back off (that is, the probability of the suffix)
- * @param fallback_from The order that the corresponding fallback
- * probability in the fallback_probs is from
- * @param combined_fallback interpolated fallback_probs
- * @param outputs The output streams, one for each order
- */
-void HandleSuffix(NGramHandlers &handlers, WordIndex *suffix_begin,
- WordIndex *suffix_end,
- const util::FixedArray<float> &fallback_probs,
- const util::FixedArray<uint8_t> &fallback_from,
- float combined_fallback,
- util::stream::Streams &outputs) {
- uint8_t order = std::distance(suffix_begin, suffix_end) + 1;
- if (order > outputs.size()) return;
-
- util::stream::Stream &output = outputs[order - 1];
- NGramHandler &handler = handlers[order - 1];
-
- while (true) {
- // find the next smallest ngram which matches our suffix
- // TODO: priority queue driven.
- WordIndex *minimum = NULL;
- for (std::size_t i = 0; i < handler.ActiveSize(); ++i) {
- if (!std::equal(suffix_begin, suffix_end, handler[i].Stream()->begin() + 1))
- continue;
-
- // if we either haven't set a minimum yet or this one is smaller than
- // the minimum we found before, replace it
- WordIndex *last = handler[i].Stream()->begin();
- if (!minimum || *last < *minimum) { minimum = handler[i].Stream()->begin(); }
- }
-
- // no more ngrams of this order match our suffix, so we're done
- if (!minimum) return;
-
- handler.out_record.ReBase(output.Get());
- std::copy(minimum, minimum + order, handler.out_record.begin());
-
- // Default case is having backed off.
- std::copy(fallback_probs.begin(), fallback_probs.end(), handler.probs.begin());
- std::copy(fallback_from.begin(), fallback_from.end(), handler.from.begin());
-
- for (std::size_t i = 0; i < handler.ActiveSize();) {
- if (std::equal(handler.out_record.begin(), handler.out_record.end(),
- handler[i].Stream()->begin())) {
- handler.probs[handler[i].model] = handler.info.lambdas[handler[i].model] * handler[i].Stream()->Value().prob;
- handler.from[handler[i].model] = order - 1;
- if (++handler[i].Stream()) {
- ++i;
- } else {
- handler.erase(i);
- }
- } else {
- ++i;
- }
- }
- handler.out_record.Prob() = std::accumulate(handler.probs.begin(), handler.probs.end(), 0.0);
- handler.out_record.LowerProb() = combined_fallback;
- handler.encoder.Encode(handler.from.begin(),
- handler.out_record.FromBegin());
-
- // we've handled this particular ngram, so now recurse to the higher
- // order using the current ngram as the suffix
- HandleSuffix(handlers, handler.out_record.begin(), handler.out_record.end(),
- handler.probs, handler.from, handler.out_record.Prob(), outputs);
- // consume the output
- ++output;
- }
-}
-
-/**
- * Kicks off the recursion for computing the probabilities and "from"
- * values for each ngram order. We begin by handling the UNK token that
- * should be at the front of each of the unigram input streams. This is
- * then output to the stream and it is used as the fallback for handling
- * our unigram case, the unigram used as the fallback for the bigram case,
- * etc.
- */
-void HandleNGrams(NGramHandlers &handlers, util::stream::Streams &outputs) {
- PartialProbGamma unk_record(1, 0);
- // First: populate the unk probabilities by reading the first unigram
- // from each stream
- util::FixedArray<float> unk_probs(handlers[0].info.Models());
-
- // start by populating the ngram id from the first stream
- lm::NGram<ProbBackoff> ngram = *handlers[0][0].Stream();
- unk_record.ReBase(outputs[0].Get());
- std::copy(ngram.begin(), ngram.end(), unk_record.begin());
- unk_record.Prob() = 0;
-
- // then populate the probabilities into unk_probs while "multiply" the
- // model probabilities together into the unk record
- //
- // note that from doesn't need to be set for unigrams
- assert(handlers[0].ActiveSize() == handlers[0].info.Models());
- for (std::size_t i = 0; i < handlers[0].info.Models();) {
- ngram = *handlers[0][i].Stream();
- unk_probs.push_back(handlers[0].info.lambdas[i] * ngram.Value().prob);
- unk_record.Prob() += unk_probs[i];
- assert(*ngram.begin() == kUNK);
- if (++handlers[0][i].Stream()) {
- ++i;
- } else {
- handlers[0].erase(i);
- }
- }
- float unk_combined = unk_record.Prob();
- unk_record.LowerProb() = unk_combined;
- // flush the unk output record
- ++outputs[0];
-
- // Then, begin outputting everything in lexicographic order: first we'll
- // get the unigram then the first bigram with that context, then the
- // first trigram with that bigram context, etc., until we exhaust all of
- // the ngrams, then all of the (n-1)grams, etc.
- //
- // This function is the "root" of this recursive process.
- util::FixedArray<uint8_t> unk_from(handlers[0].info.Models());
- for (std::size_t i = 0; i < handlers[0].info.Models(); ++i) {
- unk_from.push_back(0);
- }
-
- // the two nulls are to encode that our "fallback" word is the "0-gram"
- // case, e.g. we "backed off" to UNK
- // TODO: stop generating vocab ids and LowerProb for unigrams.
- HandleSuffix(handlers, NULL, NULL, unk_probs, unk_from, unk_combined, outputs);
-
- // Verify we reached the end. And poison!
- for (std::size_t i = 0; i < handlers.size(); ++i) {
- UTIL_THROW_IF2(handlers[i].ActiveSize(),
- "MergeProbabilities did not exhaust all ngram streams");
- outputs[i].Poison();
- }
-}
-}
-
-void MergeProbabilities(
- const InterpolateInfo &info,
- util::FixedArray<util::stream::ChainPositions> &models_by_order,
- util::stream::Chains &output_chains) {
- NGramHandlers handlers(output_chains.size());
- for (std::size_t i = 0; i < output_chains.size(); ++i) {
- handlers.push_back(i + 1, info, models_by_order);
- }
-
- util::stream::ChainPositions output_pos(output_chains);
- util::stream::Streams outputs(output_pos);
-
- HandleNGrams(handlers, outputs);
-}
-}
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_probabilities.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_probabilities.hh b/ext/kenlm/lm/interpolate/merge_probabilities.hh
deleted file mode 100644
index 59b8046..0000000
--- a/ext/kenlm/lm/interpolate/merge_probabilities.hh
+++ /dev/null
@@ -1,89 +0,0 @@
-#ifndef LM_INTERPOLATE_MERGE_PROBABILITIES_H
-#define LM_INTERPOLATE_MERGE_PROBABILITIES_H
-
-#include "lm/common/ngram.hh"
-#include "lm/interpolate/bounded_sequence_encoding.hh"
-#include "util/fixed_array.hh"
-#include "util/stream/multi_stream.hh"
-
-#include <stdint.h>
-
-namespace lm {
-namespace interpolate {
-
-struct InterpolateInfo;
-
-/**
- * Make the encoding of backoff values for a given order. This stores values
- * in [PartialProbGamma::FromBegin(), PartialProbGamma::FromEnd())
- */
-BoundedSequenceEncoding MakeEncoder(const InterpolateInfo &info, uint8_t order);
-
-/**
- * The first pass for the offline log-linear interpolation algorithm. This
- * reads K **suffix-ordered** streams for each model, for each order, of
- * ngram records (ngram-id, prob, backoff). It further assumes that the
- * ngram-ids have been unified over all of the stream inputs.
- *
- * Its output is records of (ngram-id, prob-prod, backoff-level,
- * backoff-level, ...) where the backoff-levels (of which there are K) are
- * the context length (0 for unigrams) that the corresponding model had to
- * back off to in order to obtain a probability for that ngram-id. Each of
- * these streams is terminated with a record whose ngram-id is all
- * maximum-integers for simplicity in implementation here.
- *
- * @param models An array of length N (max_i N_i) containing at
- * the ChainPositions for the streams for order (i + 1).
- * @param output_chains The output chains for each order (of length K)
- */
-void MergeProbabilities(
- const InterpolateInfo &info,
- util::FixedArray<util::stream::ChainPositions> &models_by_order,
- util::stream::Chains &output_chains);
-
-/**
- * This class represents the output payload for this pass, which consists
- * of an ngram-id, a probability, and then a vector of orders from which
- * each of the component models backed off to for this ngram, encoded
- * using the BoundedSequenceEncoding class.
- */
-class PartialProbGamma : public lm::NGramHeader {
-public:
- PartialProbGamma(std::size_t order, std::size_t backoff_bytes)
- : lm::NGramHeader(NULL, order), backoff_bytes_(backoff_bytes) {
- // nothing
- }
-
- std::size_t TotalSize() const {
- return sizeof(WordIndex) * Order() + sizeof(After) + backoff_bytes_;
- }
-
- // TODO: cache bounded sequence encoding in the pipeline?
- static std::size_t TotalSize(const InterpolateInfo &info, uint8_t order) {
- return sizeof(WordIndex) * order + sizeof(After) + MakeEncoder(info, order).EncodedLength();
- }
-
- float &Prob() { return Pay().prob; }
- float Prob() const { return Pay().prob; }
-
- float &LowerProb() { return Pay().lower_prob; }
- float LowerProb() const { return Pay().lower_prob; }
-
- const uint8_t *FromBegin() const { return Pay().from; }
- uint8_t *FromBegin() { return Pay().from; }
-
-private:
- struct After {
- // Note that backoff_and_normalize assumes this comes first.
- float prob;
- float lower_prob;
- uint8_t from[];
- };
- const After &Pay() const { return *reinterpret_cast<const After *>(end()); }
- After &Pay() { return *reinterpret_cast<After*>(end()); }
-
- std::size_t backoff_bytes_;
-};
-
-}} // namespaces
-#endif // LM_INTERPOLATE_MERGE_PROBABILITIES_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test1
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_test/test1 b/ext/kenlm/lm/interpolate/merge_test/test1
deleted file mode 100644
index 08d114d..0000000
Binary files a/ext/kenlm/lm/interpolate/merge_test/test1 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test2
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_test/test2 b/ext/kenlm/lm/interpolate/merge_test/test2
deleted file mode 100644
index fd3a380..0000000
Binary files a/ext/kenlm/lm/interpolate/merge_test/test2 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test3
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_test/test3 b/ext/kenlm/lm/interpolate/merge_test/test3
deleted file mode 100644
index 6c89d7f..0000000
Binary files a/ext/kenlm/lm/interpolate/merge_test/test3 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test_bad_order
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_test/test_bad_order b/ext/kenlm/lm/interpolate/merge_test/test_bad_order
deleted file mode 100644
index d50a5e8..0000000
Binary files a/ext/kenlm/lm/interpolate/merge_test/test_bad_order and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_test/test_no_unk
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_test/test_no_unk b/ext/kenlm/lm/interpolate/merge_test/test_no_unk
deleted file mode 100644
index fbcf12d..0000000
--- a/ext/kenlm/lm/interpolate/merge_test/test_no_unk
+++ /dev/null
@@ -1 +0,0 @@
-toto
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_vocab.cc b/ext/kenlm/lm/interpolate/merge_vocab.cc
deleted file mode 100644
index 3e84389..0000000
--- a/ext/kenlm/lm/interpolate/merge_vocab.cc
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "lm/interpolate/merge_vocab.hh"
-
-#include "lm/enumerate_vocab.hh"
-#include "lm/interpolate/universal_vocab.hh"
-#include "lm/lm_exception.hh"
-#include "lm/vocab.hh"
-#include "util/file_piece.hh"
-
-#include <queue>
-#include <string>
-#include <iostream>
-#include <vector>
-
-namespace lm {
-namespace interpolate {
-namespace {
-
-class VocabFileReader {
- public:
- explicit VocabFileReader(const int fd, size_t model_num, uint64_t offset = 0);
-
- VocabFileReader &operator++();
- operator bool() const { return !eof_; }
- uint64_t operator*() const { return Value(); }
-
- uint64_t Value() const { return hash_value_; }
- size_t ModelNum() const { return model_num_; }
- WordIndex CurrentIndex() const { return current_index_; }
-
- StringPiece Word() const { return word_; }
-
- private:
- uint64_t hash_value_;
- WordIndex current_index_;
- bool eof_;
- size_t model_num_;
- StringPiece word_;
- util::FilePiece file_piece_;
-};
-
-VocabFileReader::VocabFileReader(const int fd, const size_t model_num, uint64_t offset) :
- hash_value_(0),
- current_index_(0),
- eof_(false),
- model_num_(model_num),
- file_piece_(fd) {
- word_ = file_piece_.ReadLine('\0');
- UTIL_THROW_IF(word_ != "<unk>",
- FormatLoadException,
- "Vocabulary words are in the wrong place.");
- // setup to initial value
- ++*this;
-}
-
-VocabFileReader &VocabFileReader::operator++() {
- try {
- word_ = file_piece_.ReadLine('\0');
- } catch(util::EndOfFileException &e) {
- eof_ = true;
- return *this;
- }
- uint64_t prev_hash_value = hash_value_;
- hash_value_ = ngram::detail::HashForVocab(word_.data(), word_.size());
-
- // hash values should be monotonically increasing
- UTIL_THROW_IF(hash_value_ < prev_hash_value, FormatLoadException,
- ": word index not monotonically increasing."
- << " model_num: " << model_num_
- << " prev hash: " << prev_hash_value
- << " new hash: " << hash_value_);
-
- ++current_index_;
- return *this;
-}
-
-class CompareFiles {
-public:
- bool operator()(const VocabFileReader* x,
- const VocabFileReader* y)
- { return x->Value() > y->Value(); }
-};
-
-class Readers : public util::FixedArray<VocabFileReader> {
- public:
- Readers(std::size_t number) : util::FixedArray<VocabFileReader>(number) {}
- void push_back(int fd, std::size_t i) {
- new(end()) VocabFileReader(fd, i);
- Constructed();
- }
-};
-
-} // namespace
-
-WordIndex MergeVocab(util::FixedArray<util::scoped_fd> &files, UniversalVocab &vocab, EnumerateVocab &enumerate) {
- typedef std::priority_queue<VocabFileReader*, std::vector<VocabFileReader*>, CompareFiles> HeapType;
- HeapType heap;
- Readers readers(files.size());
- for (size_t i = 0; i < files.size(); ++i) {
- readers.push_back(files[i].release(), i);
- heap.push(&readers.back());
- // initialize first index to 0 for <unk>
- vocab.InsertUniversalIdx(i, 0, 0);
- }
-
- uint64_t prev_hash_value = 0;
- // global_index starts with <unk> which is 0
- WordIndex global_index = 0;
-
- enumerate.Add(0, "<unk>");
- while (!heap.empty()) {
- VocabFileReader* top_vocab_file = heap.top();
- if (top_vocab_file->Value() != prev_hash_value) {
- enumerate.Add(++global_index, top_vocab_file->Word());
- }
- vocab.InsertUniversalIdx(top_vocab_file->ModelNum(),
- top_vocab_file->CurrentIndex(),
- global_index);
-
- prev_hash_value = top_vocab_file->Value();
-
- heap.pop();
- if (++(*top_vocab_file)) {
- heap.push(top_vocab_file);
- }
- }
- return global_index + 1;
-}
-
-} // namespace interpolate
-} // namespace lm
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_vocab.hh b/ext/kenlm/lm/interpolate/merge_vocab.hh
deleted file mode 100644
index cc74d33..0000000
--- a/ext/kenlm/lm/interpolate/merge_vocab.hh
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef LM_INTERPOLATE_MERGE_VOCAB_H
-#define LM_INTERPOLATE_MERGE_VOCAB_H
-
-#include "lm/word_index.hh"
-#include "util/file.hh"
-#include "util/fixed_array.hh"
-
-namespace lm {
-
-class EnumerateVocab;
-
-namespace interpolate {
-
-class UniversalVocab;
-
-// Takes ownership of vocab_files.
-// The combined vocabulary is enumerated with enumerate.
-// Returns the size of the combined vocabulary.
-WordIndex MergeVocab(util::FixedArray<util::scoped_fd> &vocab_files, UniversalVocab &vocab, EnumerateVocab &enumerate);
-
-}} // namespaces
-
-#endif // LM_INTERPOLATE_MERGE_VOCAB_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/merge_vocab_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/merge_vocab_test.cc b/ext/kenlm/lm/interpolate/merge_vocab_test.cc
deleted file mode 100644
index 6df25b2..0000000
--- a/ext/kenlm/lm/interpolate/merge_vocab_test.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-#define BOOST_TEST_MODULE InterpolateMergeVocabTest
-#include <boost/test/unit_test.hpp>
-
-#include "lm/enumerate_vocab.hh"
-#include "lm/interpolate/merge_vocab.hh"
-#include "lm/interpolate/universal_vocab.hh"
-#include "lm/lm_exception.hh"
-#include "lm/vocab.hh"
-#include "lm/word_index.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-
-#include <cstring>
-
-namespace lm {
-namespace interpolate {
-namespace {
-
-// Stupid bjam permutes the command line arguments randomly.
-class TestFiles {
- public:
- TestFiles() {
- char **argv = boost::unit_test::framework::master_test_suite().argv;
- int argc = boost::unit_test::framework::master_test_suite().argc;
- BOOST_REQUIRE_EQUAL(6, argc);
- for (int i = 1; i < argc; ++i) {
- EndsWithAssign(argv[i], "test1", test[0]);
- EndsWithAssign(argv[i], "test2", test[1]);
- EndsWithAssign(argv[i], "test3", test[2]);
- EndsWithAssign(argv[i], "no_unk", no_unk);
- EndsWithAssign(argv[i], "bad_order", bad_order);
- }
- }
-
- void EndsWithAssign(char *arg, StringPiece value, util::scoped_fd &to) {
- StringPiece str(arg);
- if (str.size() < value.size()) return;
- if (std::memcmp(str.data() + str.size() - value.size(), value.data(), value.size())) return;
- to.reset(util::OpenReadOrThrow(arg));
- }
-
- util::scoped_fd test[3], no_unk, bad_order;
-};
-
-class DoNothingEnumerate : public EnumerateVocab {
- public:
- void Add(WordIndex, const StringPiece &) {}
-};
-
-BOOST_AUTO_TEST_CASE(MergeVocabTest) {
- TestFiles files;
-
- util::FixedArray<util::scoped_fd> used_files(3);
- used_files.push_back(files.test[0].release());
- used_files.push_back(files.test[1].release());
- used_files.push_back(files.test[2].release());
-
- std::vector<lm::WordIndex> model_max_idx;
- model_max_idx.push_back(10);
- model_max_idx.push_back(10);
- model_max_idx.push_back(10);
-
- util::scoped_fd combined(util::MakeTemp("temporary"));
-
- UniversalVocab universal_vocab(model_max_idx);
- {
- ngram::ImmediateWriteWordsWrapper writer(NULL, combined.get(), 0);
- MergeVocab(used_files, universal_vocab, writer);
- }
-
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 0), 0);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 0), 0);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 0), 0);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 1), 1);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 1), 2);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 1), 8);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(0, 5), 11);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(1, 3), 4);
- BOOST_CHECK_EQUAL(universal_vocab.GetUniversalIdx(2, 3), 10);
-
- util::FilePiece f(combined.release());
- BOOST_CHECK_EQUAL("<unk>", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("a", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("is this", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("this a", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("first cut", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("this", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("a first", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("cut", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("is", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("i", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("secd", f.ReadLine('\0'));
- BOOST_CHECK_EQUAL("first", f.ReadLine('\0'));
- BOOST_CHECK_THROW(f.ReadLine('\0'), util::EndOfFileException);
-}
-
-BOOST_AUTO_TEST_CASE(MergeVocabNoUnkTest) {
- TestFiles files;
- util::FixedArray<util::scoped_fd> used_files(1);
- used_files.push_back(files.no_unk.release());
-
- std::vector<lm::WordIndex> model_max_idx;
- model_max_idx.push_back(10);
-
- UniversalVocab universal_vocab(model_max_idx);
- DoNothingEnumerate nothing;
- BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
-}
-
-BOOST_AUTO_TEST_CASE(MergeVocabWrongOrderTest) {
- TestFiles files;
-
- util::FixedArray<util::scoped_fd> used_files(2);
- used_files.push_back(files.test[0].release());
- used_files.push_back(files.bad_order.release());
-
- std::vector<lm::WordIndex> model_max_idx;
- model_max_idx.push_back(10);
- model_max_idx.push_back(10);
-
- lm::interpolate::UniversalVocab universal_vocab(model_max_idx);
- DoNothingEnumerate nothing;
- BOOST_CHECK_THROW(MergeVocab(used_files, universal_vocab, nothing), FormatLoadException);
-}
-
-}}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/normalize.cc b/ext/kenlm/lm/interpolate/normalize.cc
deleted file mode 100644
index f683f10..0000000
--- a/ext/kenlm/lm/interpolate/normalize.cc
+++ /dev/null
@@ -1,384 +0,0 @@
-#include "lm/interpolate/normalize.hh"
-
-#include "lm/common/compare.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/interpolate/backoff_matrix.hh"
-#include "lm/interpolate/bounded_sequence_encoding.hh"
-#include "lm/interpolate/interpolate_info.hh"
-#include "lm/interpolate/merge_probabilities.hh"
-#include "lm/weights.hh"
-#include "lm/word_index.hh"
-#include "util/fixed_array.hh"
-#include "util/scoped.hh"
-#include "util/stream/stream.hh"
-#include "util/stream/rewindable_stream.hh"
-
-#include <functional>
-#include <queue>
-#include <vector>
-
-namespace lm { namespace interpolate {
-namespace {
-
-struct SuffixLexicographicLess : public std::binary_function<NGramHeader, NGramHeader, bool> {
- bool operator()(const NGramHeader first, const NGramHeader second) const {
- for (const WordIndex *f = first.end() - 1, *s = second.end() - 1; f >= first.begin() && s >= second.begin(); --f, --s) {
- if (*f < *s) return true;
- if (*f > *s) return false;
- }
- return first.size() < second.size();
- }
-};
-
-class BackoffQueueEntry {
- public:
- BackoffQueueEntry(float &entry, const util::stream::ChainPosition &position)
- : entry_(entry), stream_(position) {
- entry_ = 0.0;
- }
-
- operator bool() const { return stream_; }
-
- NGramHeader operator*() const { return *stream_; }
- const NGramHeader *operator->() const { return &*stream_; }
-
- void Enter() {
- entry_ = stream_->Value().backoff;
- }
-
- BackoffQueueEntry &Next() {
- entry_ = 0.0;
- ++stream_;
- return *this;
- }
-
- private:
- float &entry_;
- NGramStream<ProbBackoff> stream_;
-};
-
-struct PtrGreater : public std::binary_function<const BackoffQueueEntry *, const BackoffQueueEntry *, bool> {
- bool operator()(const BackoffQueueEntry *first, const BackoffQueueEntry *second) const {
- return SuffixLexicographicLess()(**second, **first);
- }
-};
-
-class EntryOwner : public util::FixedArray<BackoffQueueEntry> {
- public:
- void push_back(float &entry, const util::stream::ChainPosition &position) {
- new (end()) BackoffQueueEntry(entry, position);
- Constructed();
- }
-};
-
-std::size_t MaxOrder(const util::FixedArray<util::stream::ChainPositions> &model) {
- std::size_t ret = 0;
- for (const util::stream::ChainPositions *m = model.begin(); m != model.end(); ++m) {
- ret = std::max(ret, m->size());
- }
- return ret;
-}
-
-class BackoffManager {
- public:
- explicit BackoffManager(const util::FixedArray<util::stream::ChainPositions> &models)
- : entered_(MaxOrder(models)), matrix_(models.size(), MaxOrder(models)), skip_write_(MaxOrder(models)) {
- std::size_t total = 0;
- for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) {
- total += m->size();
- }
- for (std::size_t i = 0; i < MaxOrder(models); ++i) {
- entered_.push_back(models.size());
- }
- owner_.Init(total);
- for (const util::stream::ChainPositions *m = models.begin(); m != models.end(); ++m) {
- for (const util::stream::ChainPosition *j = m->begin(); j != m->end(); ++j) {
- owner_.push_back(matrix_.Backoff(m - models.begin(), j - m->begin()), *j);
- if (owner_.back()) {
- queue_.push(&owner_.back());
- }
- }
- }
- }
-
- void SetupSkip(std::size_t order, util::stream::Stream &stream) {
- skip_write_[order - 2] = &stream;
- }
-
- // Move up the backoffs for the given n-gram. The n-grams must be provided
- // in suffix lexicographic order.
- void Enter(const NGramHeader &to) {
- // Check that we exited properly.
- for (std::size_t i = to.Order() - 1; i < entered_.size(); ++i) {
- assert(entered_[i].empty());
- }
- SuffixLexicographicLess less;
- while (!queue_.empty() && less(**queue_.top(), to))
- SkipRecord();
- while (TopMatches(to)) {
- BackoffQueueEntry *matches = queue_.top();
- entered_[to.Order() - 1].push_back(matches);
- matches->Enter();
- queue_.pop();
- }
- }
-
- void Exit(std::size_t order_minus_1) {
- for (BackoffQueueEntry **i = entered_[order_minus_1].begin(); i != entered_[order_minus_1].end(); ++i) {
- if ((*i)->Next())
- queue_.push(*i);
- }
- entered_[order_minus_1].clear();
- }
-
- float Get(std::size_t model, std::size_t order_minus_1) const {
- return matrix_.Backoff(model, order_minus_1);
- }
-
- void Finish() {
- while (!queue_.empty())
- SkipRecord();
- }
-
- private:
- void SkipRecord() {
- BackoffQueueEntry *top = queue_.top();
- queue_.pop();
- // Is this the last instance of the n-gram?
- if (!TopMatches(**top)) {
- // An n-gram is being skipped. Called once per skipped n-gram,
- // regardless of how many models it comes from.
- *reinterpret_cast<float*>(skip_write_[(*top)->Order() - 1]->Get()) = 0.0;
- ++*skip_write_[(*top)->Order() - 1];
- }
- if (top->Next())
- queue_.push(top);
- }
-
- bool TopMatches(const NGramHeader &header) const {
- return !queue_.empty() && (*queue_.top())->Order() == header.Order() && std::equal(header.begin(), header.end(), (*queue_.top())->begin());
- }
-
- EntryOwner owner_;
- std::priority_queue<BackoffQueueEntry*, std::vector<BackoffQueueEntry*>, PtrGreater> queue_;
-
- // Indexed by order then just all the matching models.
- util::FixedArray<util::FixedArray<BackoffQueueEntry*> > entered_;
-
- std::size_t order_;
-
- BackoffMatrix matrix_;
-
- std::vector<util::stream::Stream*> skip_write_;
-};
-
-typedef long double Accum;
-
-// Handles n-grams of the same order, using recursion to call another instance
-// for higher orders.
-class Recurse {
- public:
- Recurse(
- const InterpolateInfo &info, // Must stay alive the entire time.
- std::size_t order,
- const util::stream::ChainPosition &merged_probs,
- const util::stream::ChainPosition &prob_out,
- const util::stream::ChainPosition &backoff_out,
- BackoffManager &backoffs,
- Recurse *higher) // higher is null for the highest order.
- : order_(order),
- encoding_(MakeEncoder(info, order)),
- input_(merged_probs, PartialProbGamma(order, encoding_.EncodedLength())),
- prob_out_(prob_out),
- backoff_out_(backoff_out),
- backoffs_(backoffs),
- lambdas_(&*info.lambdas.begin()),
- higher_(higher),
- decoded_backoffs_(info.Models()),
- extended_context_(order - 1) {
- // This is only for bigrams and above. Summing unigrams is a much easier case.
- assert(order >= 2);
- }
-
- // context = w_1^{n-1}
- // z_lower = Z(w_2^{n-1})
- // Input:
- // Merged probabilities without backoff applied in input_.
- // Backoffs via backoffs_.
- // Calculates:
- // Z(w_1^{n-1}): intermediate only.
- // p_I(x | w_1^{n-1}) for all x: w_1^{n-1}x exists: Written to prob_out_.
- // b_I(w_1^{n-1}): Written to backoff_out_.
- void SameContext(const NGramHeader &context, Accum z_lower) {
- assert(context.size() == order_ - 1);
- backoffs_.Enter(context);
- prob_out_.Mark();
-
- // This is the backoff term that applies when one assumes everything backs off:
- // \prod_i b_i(w_1^{n-1})^{\lambda_i}.
- Accum backoff_once = 0.0;
- for (std::size_t m = 0; m < decoded_backoffs_.size(); ++m) {
- backoff_once += lambdas_[m] * backoffs_.Get(m, order_ - 2);
- }
-
- Accum z_delta = 0.0;
- std::size_t count = 0;
- for (; input_ && std::equal(context.begin(), context.end(), input_->begin()); ++input_, ++prob_out_, ++count) {
- // Apply backoffs to probabilities.
- // TODO: change bounded sequence encoding to have an iterator for decoding instead of doing a copy here.
- encoding_.Decode(input_->FromBegin(), &*decoded_backoffs_.begin());
- for (std::size_t m = 0; m < NumModels(); ++m) {
- // Apply the backoffs as instructed for model m.
- float accumulated = 0.0;
- // Change backoffs for [order it backed off to, order - 1) except
- // with 0-indexing. There is still the potential to charge backoff
- // for order - 1, which is done later. The backoffs charged here
- // are b_m(w_{n-1}^{n-1}) ... b_m(w_2^{n-1})
- for (unsigned char backed_to = decoded_backoffs_[m]; backed_to < order_ - 2; ++backed_to) {
- accumulated += backoffs_.Get(m, backed_to);
- }
- float lambda = lambdas_[m];
- // Lower p(x | w_2^{n-1}) gets all the backoffs except the highest.
- input_->LowerProb() += accumulated * lambda;
- // Charge the backoff b(w_1^{n-1}) if applicable, but only to attain p(x | w_1^{n-1})
- if (decoded_backoffs_[m] < order_ - 1) {
- accumulated += backoffs_.Get(m, order_ - 2);
- }
- input_->Prob() += accumulated * lambda;
- }
- // TODO: better precision/less operations here.
- z_delta += pow(10.0, input_->Prob()) - pow(10.0, input_->LowerProb() + backoff_once);
-
- // Write unnormalized probability record.
- std::copy(input_->begin(), input_->end(), reinterpret_cast<WordIndex*>(prob_out_.Get()));
- ProbWrite() = input_->Prob();
- }
- // TODO numerical precision.
- Accum z = log10(pow(10.0, z_lower + backoff_once) + z_delta);
-
- // Normalize.
- prob_out_.Rewind();
- for (std::size_t i = 0; i < count; ++i, ++prob_out_) {
- ProbWrite() -= z;
- }
- // This allows the stream to release data.
- prob_out_.Mark();
-
- // Output backoff.
- *reinterpret_cast<float*>(backoff_out_.Get()) = z_lower + backoff_once - z;
- ++backoff_out_;
-
- if (higher_.get())
- higher_->ExtendContext(context, z);
-
- backoffs_.Exit(order_ - 2);
- }
-
- // Call is given a context and z(context).
- // Evaluates y context x for all y,x.
- void ExtendContext(const NGramHeader &middle, Accum z_lower) {
- assert(middle.size() == order_ - 2);
- // Copy because the input will advance. TODO avoid this copy by sharing amongst classes.
- std::copy(middle.begin(), middle.end(), extended_context_.begin() + 1);
- while (input_ && std::equal(middle.begin(), middle.end(), input_->begin() + 1)) {
- *extended_context_.begin() = *input_->begin();
- SameContext(NGramHeader(&*extended_context_.begin(), order_ - 1), z_lower);
- }
- }
-
- void Finish() {
- assert(!input_);
- prob_out_.Poison();
- backoff_out_.Poison();
- if (higher_.get())
- higher_->Finish();
- }
-
- // The BackoffManager class also injects backoffs when it skips ahead e.g. b(</s>) = 1
- util::stream::Stream &BackoffStream() { return backoff_out_; }
-
- private:
- // Write the probability to the correct place in prob_out_. Should use a proxy but currently incompatible with RewindableStream.
- float &ProbWrite() {
- return *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_out_.Get()) + order_ * sizeof(WordIndex));
- }
-
- std::size_t NumModels() const { return decoded_backoffs_.size(); }
-
- const std::size_t order_;
-
- const BoundedSequenceEncoding encoding_;
-
- ProxyStream<PartialProbGamma> input_;
- util::stream::RewindableStream prob_out_;
- util::stream::Stream backoff_out_;
-
- BackoffManager &backoffs_;
- const float *const lambdas_;
-
- // Higher order instance of this same class.
- util::scoped_ptr<Recurse> higher_;
-
- // Temporary in SameContext.
- std::vector<unsigned char> decoded_backoffs_;
- // Temporary in ExtendContext.
- std::vector<WordIndex> extended_context_;
-};
-
-class Thread {
- public:
- Thread(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &prob_out, util::stream::Chains &backoff_out)
- : info_(info), models_by_order_(models_by_order), prob_out_(prob_out), backoff_out_(backoff_out) {}
-
- void Run(const util::stream::ChainPositions &merged_probabilities) {
- // Unigrams do not have enocded backoff info.
- ProxyStream<PartialProbGamma> in(merged_probabilities[0], PartialProbGamma(1, 0));
- util::stream::RewindableStream prob_write(prob_out_[0]);
- Accum z = 0.0;
- prob_write.Mark();
- WordIndex count = 0;
- for (; in; ++in, ++prob_write, ++count) {
- // Note assumption that probabilitity comes first
- memcpy(prob_write.Get(), in.Get(), sizeof(WordIndex) + sizeof(float));
- z += pow(10.0, in->Prob());
- }
- // TODO HACK TODO: lmplz outputs p(<s>) = 1 to get q to compute nicely. That will always result in 1.0 more than it should be.
- z -= 1.0;
- float log_z = log10(z);
- prob_write.Rewind();
- // Normalize unigram probabilities.
- for (WordIndex i = 0; i < count; ++i, ++prob_write) {
- *reinterpret_cast<float*>(reinterpret_cast<uint8_t*>(prob_write.Get()) + sizeof(WordIndex)) -= log_z;
- }
- prob_write.Poison();
-
- // Now setup the higher orders.
- util::scoped_ptr<Recurse> higher_order;
- BackoffManager backoffs(models_by_order_);
- std::size_t max_order = merged_probabilities.size();
- for (std::size_t order = max_order; order >= 2; --order) {
- higher_order.reset(new Recurse(info_, order, merged_probabilities[order - 1], prob_out_[order - 1], backoff_out_[order - 2], backoffs, higher_order.release()));
- backoffs.SetupSkip(order, higher_order->BackoffStream());
- }
- if (max_order > 1) {
- higher_order->ExtendContext(NGramHeader(NULL, 0), log_z);
- higher_order->Finish();
- }
- }
-
- private:
- const InterpolateInfo info_;
- util::FixedArray<util::stream::ChainPositions> &models_by_order_;
- util::stream::ChainPositions prob_out_;
- util::stream::ChainPositions backoff_out_;
-};
-
-} // namespace
-
-void Normalize(const InterpolateInfo &info, util::FixedArray<util::stream::ChainPositions> &models_by_order, util::stream::Chains &merged_probabilities, util::stream::Chains &prob_out, util::stream::Chains &backoff_out) {
- assert(prob_out.size() == backoff_out.size() + 1);
- // Arbitrarily put the thread on the merged_probabilities Chains.
- merged_probabilities >> Thread(info, models_by_order, prob_out, backoff_out);
-}
-
-}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/normalize.hh b/ext/kenlm/lm/interpolate/normalize.hh
deleted file mode 100644
index dd26b25..0000000
--- a/ext/kenlm/lm/interpolate/normalize.hh
+++ /dev/null
@@ -1,35 +0,0 @@
-#ifndef LM_INTERPOLATE_NORMALIZE_H
-#define LM_INTERPOLATE_NORMALIZE_H
-
-#include "util/fixed_array.hh"
-
-/* Pass 2:
- * - Multiply backoff weights by the backed off probabilities from pass 1.
- * - Compute the normalization factor Z.
- * - Send Z to the next highest order.
- * - Rewind and divide by Z.
- */
-
-namespace util { namespace stream {
-class ChainPositions;
-class Chains;
-}} // namespaces
-
-namespace lm { namespace interpolate {
-
-struct InterpolateInfo;
-
-void Normalize(
- const InterpolateInfo &info,
- // Input full models for backoffs. Assumes that renumbering has been done. Suffix order.
- util::FixedArray<util::stream::ChainPositions> &models_by_order,
- // Input PartialProbGamma from MergeProbabilities. Context order.
- util::stream::Chains &merged_probabilities,
- // Output NGram<float> with normalized probabilities. Context order.
- util::stream::Chains &probabilities_out,
- // Output bare floats with backoffs. Note backoffs.size() == order - 1. Suffix order.
- util::stream::Chains &backoffs_out);
-
-}} // namespaces
-
-#endif // LM_INTERPOLATE_NORMALIZE_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/normalize_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/normalize_test.cc b/ext/kenlm/lm/interpolate/normalize_test.cc
deleted file mode 100644
index fe220f3..0000000
--- a/ext/kenlm/lm/interpolate/normalize_test.cc
+++ /dev/null
@@ -1,86 +0,0 @@
-#include "lm/interpolate/normalize.hh"
-
-#include "lm/interpolate/interpolate_info.hh"
-#include "lm/interpolate/merge_probabilities.hh"
-#include "lm/common/ngram_stream.hh"
-#include "util/stream/chain.hh"
-#include "util/stream/multi_stream.hh"
-
-#define BOOST_TEST_MODULE NormalizeTest
-#include <boost/test/unit_test.hpp>
-
-namespace lm { namespace interpolate { namespace {
-
-// log without backoff
-const float kInputs[] = {-0.3, 1.2, -9.8, 4.0, -7.0, 0.0};
-
-class WriteInput {
- public:
- WriteInput() {}
- void Run(const util::stream::ChainPosition &to) {
- util::stream::Stream out(to);
- for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float); ++i, ++out) {
- memcpy(out.Get(), &i, sizeof(WordIndex));
- memcpy((uint8_t*)out.Get() + sizeof(WordIndex), &kInputs[i], sizeof(float));
- }
- out.Poison();
- }
-};
-
-void CheckOutput(const util::stream::ChainPosition &from) {
- NGramStream<float> in(from);
- float sum = 0.0;
- for (WordIndex i = 0; i < sizeof(kInputs) / sizeof(float) - 1 /* <s> at the end */; ++i) {
- sum += pow(10.0, kInputs[i]);
- }
- sum = log10(sum);
- BOOST_REQUIRE(in);
- BOOST_CHECK_CLOSE(kInputs[0] - sum, in->Value(), 0.0001);
- BOOST_REQUIRE(++in);
- BOOST_CHECK_CLOSE(kInputs[1] - sum, in->Value(), 0.0001);
- BOOST_REQUIRE(++in);
- BOOST_CHECK_CLOSE(kInputs[2] - sum, in->Value(), 0.0001);
- BOOST_REQUIRE(++in);
- BOOST_CHECK_CLOSE(kInputs[3] - sum, in->Value(), 0.0001);
- BOOST_REQUIRE(++in);
- BOOST_CHECK_CLOSE(kInputs[4] - sum, in->Value(), 0.0001);
- BOOST_REQUIRE(++in);
- BOOST_CHECK_CLOSE(kInputs[5] - sum, in->Value(), 0.0001);
- BOOST_CHECK(!++in);
-}
-
-BOOST_AUTO_TEST_CASE(Unigrams) {
- InterpolateInfo info;
- info.lambdas.push_back(2.0);
- info.lambdas.push_back(-0.1);
- info.orders.push_back(1);
- info.orders.push_back(1);
-
- BOOST_CHECK_EQUAL(0, MakeEncoder(info, 1).EncodedLength());
-
- // No backoffs.
- util::stream::Chains blank(0);
- util::FixedArray<util::stream::ChainPositions> models_by_order(2);
- models_by_order.push_back(blank);
- models_by_order.push_back(blank);
-
- util::stream::Chains merged_probabilities(1);
- util::stream::Chains probabilities_out(1);
- util::stream::Chains backoffs_out(0);
-
- merged_probabilities.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float) + sizeof(float), 2, 24));
- probabilities_out.push_back(util::stream::ChainConfig(sizeof(WordIndex) + sizeof(float), 2, 100));
-
- merged_probabilities[0] >> WriteInput();
- Normalize(info, models_by_order, merged_probabilities, probabilities_out, backoffs_out);
-
- util::stream::ChainPosition checker(probabilities_out[0].Add());
-
- merged_probabilities >> util::stream::kRecycle;
- probabilities_out >> util::stream::kRecycle;
-
- CheckOutput(checker);
- probabilities_out.Wait();
-}
-
-}}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc b/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc
deleted file mode 100644
index a68e9e9..0000000
--- a/ext/kenlm/lm/interpolate/perf_enum_gv_main.cc
+++ /dev/null
@@ -1,215 +0,0 @@
-/*
-Usage example
-1) Download from http://www.gwinnup.org/lminterp/train-params-output.tar.bz2
-2) then run perf_enum_gv -t lm.en.dev -m model-a.3.srilm -m model-b.3.srilm -m model-c.3.srilm
- */
-
-#include "lm/ngram_query.hh"
-#include "lm/model.hh"
-#include "lm/word_index.hh"
-#include "lm/interpolate/enumerate_global_vocab.hh"
-
-#include "util/fixed_array.hh"
-#include "util/usage.hh"
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <fstream>
-#include <map>
-
-#include <boost/program_options.hpp>
-#include <boost/version.hpp>
-#include <boost/foreach.hpp>
-
-#include <Eigen/Eigen>
-
-#include <iostream>
-#include <sys/time.h>
-
-inline double deltaTV(const timeval& s, const timeval& e)
-{
- return (e.tv_sec - s.tv_sec)*1000.0 + (e.tv_usec - s.tv_usec)/1000.0;
-}
-
-typedef struct timeval Wall;
-Wall GetWall() {
- struct timeval tv;
- gettimeofday(&tv, NULL);
- return tv;
-}
-
-typedef Eigen::MatrixXf FMatrix;
-typedef Eigen::VectorXf FVector;
-
-
-bool HAS_BIAS = true;
-
-using namespace lm::ngram;
-using namespace lm;
-
-inline void logProb(Model * model, const std::vector<std::string>& ctx, const std::string& word) {
-
- // Horribly inefficient
- const Vocabulary &vocab = model->GetVocabulary();
-
- State nextState; //throwaway
-
- WordIndex word_idx = vocab.Index(word);
- WordIndex context_idx[ctx.size()];
-
- //reverse context
- for(unsigned int i = 0; i < ctx.size(); i++) {
- context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]);
- }
- FullScoreReturn score = model->FullScoreForgotState(context_idx, &(context_idx[ctx.size() -1]), word_idx, nextState);
-}
-
-void set_features(const std::vector<std::string>& ctx,
- const std::string& word,
- const std::vector<Model *>& models,
- FVector& v) {
-
- for (unsigned i=0; i < models.size(); ++i)
- logProb(models[i], ctx, word);
-
-}
-
-//const util::FixedArray<Model *>& models)
-void train_params(
- const std::vector<std::vector<std::string> >& corpus,
- const std::vector<std::string>& vocab,
- const std::vector<Model *>& models) {
- using namespace std;
-
- vector<string> context(5, "<s>");
- const int ITERATIONS = 10;
- const int nlambdas = models.size(); // #models
- FVector params = FVector::Zero(nlambdas);
- vector<FVector> feats(vocab.size(), params);
- static Wall start,stop;
-
- for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
- std::cout << "iteration: " << iter
- << " corpus size " << corpus.size()
- << std::endl;
- for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
- const vector<string>& sentence = corpus[ci];
- context.resize(5);
- for (unsigned t = 0; t < sentence.size(); ++t) { // words in sentence
- std::cout << "sentence " << ci << " word " << t << std::endl;
- start = GetWall();
- const string& ref_word_string = sentence[t];
- for (unsigned i = 0; i < vocab.size(); ++i) { // vocab
- set_features(context, vocab[i], models, feats[i]);
- }
- stop = GetWall();
- std::cout << " time elapsed = " << deltaTV(start,stop) << std::endl;
- context.push_back(ref_word_string);
- }
- }
- }
-}
-
-int main(int argc, char** argv) {
-
- std::string tuning_data;
- std::vector<std::string> lms;
-
- try {
- namespace po = boost::program_options;
- po::options_description options("train-params");
-
- options.add_options()
- ("help,h", po::bool_switch(), "Show this help message")
- ("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature")
- ("tuning_data,t", po::value<std::string>(&tuning_data), "File to tune perplexity on")
- ("model,m", po::value<std::vector<std::string> >(&lms), "Language models in KenLM format to interpolate");
- po::variables_map vm;
- po::store(po::parse_command_line(argc, argv, options), vm);
-
- // Display help
- if(argc == 1 || vm["help"].as<bool>()) {
- std::cerr << options << std::endl;
- return 1;
- }
- if (vm["no_bias_term"].as<bool>())
- HAS_BIAS = false;
- lms = vm["model"].as<std::vector<std::string> >();
- tuning_data = vm["tuning_data"].as<std::string>();
- }
- catch(const std::exception &e) {
-
- std::cerr << e.what() << std::endl;
- return 1;
-
- }
- if (lms.size() < 2) {
- std::cerr << "Please specify at least two language model files with -m LM.KLM\n";
- return 1;
- }
- if (tuning_data.empty()) {
- std::cerr << "Please specify tuning set with -t FILE.TXT\n";
- return 1;
- }
-
- std::map<std::string, int*> vmap;
- util::FixedArray<WordIndex> vm(2);
-
- //stuff it into the
- EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(&vmap, lms.size());
- // EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(vm);
-
- Config cfg;
- cfg.enumerate_vocab = (EnumerateVocab *) globalVocabBuilder;
-
- //load models
- //util::FixedArray<Model *> models(lms.size());
- std::vector<Model *> models;
- for(int i=0; i < lms.size(); i++) {
- std::cerr << "Loading LM file: " << lms[i] << std::endl;
-
- //haaaack
- globalVocabBuilder->SetCurModel(i); //yes this is dumb
-
- //models[i] = new Model(lms[i].c_str());
- Model * this_model = new Model(lms[i].c_str(), cfg);
- models.push_back( this_model );
-
- }
-
- //assemble vocabulary vector
- std::vector<std::string> vocab;
- std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl;
-
- std::pair<StringPiece,int *> me;
-
- for(std::map<std::string, int*>::iterator iter = vmap.begin(); iter != vmap.end(); ++iter) {
- vocab.push_back(iter->first);
- }
- std::cerr << "Vocab vector has size: " << vocab.size() << std::endl;
-
- //load context sorted ngrams into vector of vectors
- std::vector<std::vector<std::string> > corpus;
-
- std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl;
- std::ifstream infile(tuning_data);
-
- for(std::string line; std::getline(infile, line); ) {
-
- std::vector<std::string> words; {
-
- std::stringstream stream(line);
- std::string word;
-
- while(stream >> word) {
- words.push_back(word);
- }
- }
- corpus.push_back(words);
- }
-
- train_params(corpus, vocab, models);
-
- return 0;
-}