You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:33:58 UTC
[10/51] [partial] incubator-joshua git commit: Converted KenLM into a
submodule
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2 b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2
deleted file mode 100644
index 58d28a0..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.2 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3 b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3
deleted file mode 100644
index 1a63afe..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.3 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate
deleted file mode 100644
index fe82667..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.kenlm_intermediate
+++ /dev/null
@@ -1,3 +0,0 @@
-KenLM intermediate binary file
-Counts 6 7 6
-Payload pb
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab
deleted file mode 100644
index 763b2af..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.vocab and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_test.cc b/ext/kenlm/lm/interpolate/tune_instance_test.cc
deleted file mode 100644
index a0db59c..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance_test.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-#include "lm/interpolate/tune_instance.hh"
-
-#include "util/file_stream.hh"
-#include "util/file.hh"
-#include "util/string_piece.hh"
-
-#define BOOST_TEST_MODULE InstanceTest
-#include <boost/test/unit_test.hpp>
-
-#include <iostream>
-
-#include <vector>
-
-namespace lm { namespace interpolate { namespace {
-
-Matrix::Index FindRow(const std::vector<WordIndex> &words, WordIndex word) {
- std::vector<WordIndex>::const_iterator it = std::find(words.begin(), words.end(), word);
- BOOST_REQUIRE(it != words.end());
- return it - words.begin();
-}
-
-BOOST_AUTO_TEST_CASE(Toy) {
- util::scoped_fd test_input(util::MakeTemp("temporary"));
- {
- util::FileStream(test_input.get()) << "c\n";
- }
-
- StringPiece dir("tune_instance_data/");
- if (boost::unit_test::framework::master_test_suite().argc == 2) {
- StringPiece zero_file(boost::unit_test::framework::master_test_suite().argv[1]);
- BOOST_REQUIRE(zero_file.size() > strlen("toy0.1"));
- BOOST_REQUIRE_EQUAL("toy0.1", StringPiece(zero_file.data() + zero_file.size() - 6, 6));
- dir = StringPiece(zero_file.data(), zero_file.size() - 6);
- }
-
- std::vector<StringPiece> model_names;
- std::string full0 = std::string(dir.data(), dir.size()) + "toy0";
- std::string full1 = std::string(dir.data(), dir.size()) + "toy1";
- model_names.push_back(full0);
- model_names.push_back(full1);
-
- util::FixedArray<Instance> instances;
- Matrix ln_unigrams;
- // Returns vocab id of <s>
- BOOST_CHECK_EQUAL(1, LoadInstances(test_input.release(), model_names, instances, ln_unigrams));
- // <unk>
- BOOST_CHECK_CLOSE(-0.90309 * M_LN10, ln_unigrams(0, 0), 0.001);
- BOOST_CHECK_CLOSE(-1 * M_LN10, ln_unigrams(0, 1), 0.001);
- // <s>
- BOOST_CHECK_GT(-98.0, ln_unigrams(1, 0));
- BOOST_CHECK_GT(-98.0, ln_unigrams(1, 1));
- // a
- BOOST_CHECK_CLOSE(-0.46943438 * M_LN10, ln_unigrams(2, 0), 0.001);
- BOOST_CHECK_CLOSE(-0.6146491 * M_LN10, ln_unigrams(2, 1), 0.001);
- // </s>
- BOOST_CHECK_CLOSE(-0.5720968 * M_LN10, ln_unigrams(3, 0), 0.001);
- BOOST_CHECK_CLOSE(-0.6146491 * M_LN10, ln_unigrams(3, 1), 0.001);
- // c
- BOOST_CHECK_CLOSE(-0.90309 * M_LN10, ln_unigrams(4, 0), 0.001); // <unk>
- BOOST_CHECK_CLOSE(-0.7659168 * M_LN10, ln_unigrams(4, 1), 0.001);
- // too lazy to do b.
-
- // Two instances:
- // <s> predicts c
- // <s> c predicts </s>
- BOOST_REQUIRE_EQUAL(2, instances.size());
- BOOST_CHECK_CLOSE(-0.30103 * M_LN10, instances[0].ln_backoff(0), 0.001);
- BOOST_CHECK_CLOSE(-0.30103 * M_LN10, instances[0].ln_backoff(1), 0.001);
-
- // Backoffs of <s> c
- BOOST_CHECK_CLOSE(0.0, instances[1].ln_backoff(0), 0.001);
- BOOST_CHECK_CLOSE((-0.30103 - 0.30103) * M_LN10, instances[1].ln_backoff(1), 0.001);
-
- // Three extensions: a, b, c
- BOOST_REQUIRE_EQUAL(3, instances[0].ln_extensions.rows());
- BOOST_REQUIRE_EQUAL(3, instances[0].extension_words.size());
-
- // <s> a
- BOOST_CHECK_CLOSE(-0.37712017 * M_LN10, instances[0].ln_extensions(FindRow(instances[0].extension_words, 2), 0), 0.001);
- // <s> c
- BOOST_CHECK_CLOSE((-0.90309 + -0.30103) * M_LN10, instances[0].ln_extensions(FindRow(instances[0].extension_words, 4), 0), 0.001);
- BOOST_CHECK_CLOSE(-0.4740302 * M_LN10, instances[0].ln_extensions(FindRow(instances[0].extension_words, 4), 1), 0.001);
-
- // <s> c </s>
- BOOST_CHECK_CLOSE(-0.09113217 * M_LN10, instances[1].ln_extensions(FindRow(instances[1].extension_words, 3), 1), 0.001);
-
- // p_0(c | <s>) = p_0(c)b_0(<s>) = 10^(-0.90309 + -0.30103)
- BOOST_CHECK_CLOSE((-0.90309 + -0.30103) * M_LN10, instances[0].ln_correct(0), 0.001);
- // p_1(c | <s>) = 10^-0.4740302
- BOOST_CHECK_CLOSE(-0.4740302 * M_LN10, instances[0].ln_correct(1), 0.001);
-}
-
-}}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_main.cc b/ext/kenlm/lm/interpolate/tune_main.cc
deleted file mode 100644
index 8296af1..0000000
--- a/ext/kenlm/lm/interpolate/tune_main.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "lm/interpolate/tune_derivatives.hh"
-#include "lm/interpolate/tune_instance.hh"
-#include "util/file.hh"
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
-#include <Eigen/Dense>
-#pragma GCC diagnostic pop
-#include <boost/program_options.hpp>
-
-#include <cmath>
-#include <iostream>
-
-namespace lm { namespace interpolate {
-void TuneWeights(int tune_file, const std::vector<StringPiece> &model_names, Vector &weights) {
- util::FixedArray<Instance> instances;
- Matrix ln_unigrams;
- WordIndex bos = LoadInstances(tune_file, model_names, instances, ln_unigrams);
- ComputeDerivative derive(instances, ln_unigrams, bos);
- weights = Vector::Constant(model_names.size(), 1.0 / model_names.size());
- Vector gradient;
- Matrix hessian;
- for (std::size_t iteration = 0; iteration < 10 /*TODO fancy stopping criteria */; ++iteration) {
- std::cerr << "Iteration " << iteration << ": weights =";
- for (Vector::Index i = 0; i < weights.rows(); ++i) {
- std::cerr << ' ' << weights(i);
- }
- std::cerr << std::endl;
- std::cerr << "Perplexity = " <<
- derive.Iteration(weights, gradient, hessian)
- << std::endl;
- // TODO: 1.0 step size was too big and it kept getting unstable. More math.
- weights -= 0.7 * hessian.inverse() * gradient;
- }
-}
-}} // namespaces
-
-int main(int argc, char *argv[]) {
- Eigen::initParallel();
- namespace po = boost::program_options;
- // TODO help
- po::options_description options("Tuning options");
- std::string tuning_file;
- std::vector<std::string> input_models;
- options.add_options()
- ("tuning,t", po::value<std::string>(&tuning_file)->required(), "File to tune on. This should be a text file with one sentence per line.")
- ("model,m", po::value<std::vector<std::string> >(&input_models)->multitoken()->required(), "Models to interpolate");
- po::variables_map vm;
- po::store(po::parse_command_line(argc, argv, options), vm);
- po::notify(vm);
-
- std::vector<StringPiece> model_names;
- for (std::vector<std::string>::const_iterator i = input_models.begin(); i != input_models.end(); ++i) {
- model_names.push_back(*i);
- }
- lm::interpolate::Vector weights;
- lm::interpolate::TuneWeights(util::OpenReadOrThrow(tuning_file.c_str()), model_names, weights);
- std::cout << weights.transpose() << std::endl;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_matrix.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_matrix.hh b/ext/kenlm/lm/interpolate/tune_matrix.hh
deleted file mode 100644
index 7f1a0c9..0000000
--- a/ext/kenlm/lm/interpolate/tune_matrix.hh
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef LM_INTERPOLATE_TUNE_MATRIX_H
-#define LM_INTERPOLATE_TUNE_MATRIX_H
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wunused-local-typedefs"
-#include <Eigen/Core>
-#pragma GCC diagnostic pop
-
-namespace lm { namespace interpolate {
-
-typedef Eigen::MatrixXd Matrix;
-typedef Eigen::VectorXd Vector;
-
-typedef Matrix::Scalar Accum;
-
-}} // namespaces
-#endif // LM_INTERPOLATE_TUNE_MATRIX_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/universal_vocab.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/universal_vocab.cc b/ext/kenlm/lm/interpolate/universal_vocab.cc
deleted file mode 100644
index 5cdf41e..0000000
--- a/ext/kenlm/lm/interpolate/universal_vocab.cc
+++ /dev/null
@@ -1,13 +0,0 @@
-#include "lm/interpolate/universal_vocab.hh"
-
-namespace lm {
-namespace interpolate {
-
-UniversalVocab::UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes) {
- model_index_map_.resize(model_vocab_sizes.size());
- for (size_t i = 0; i < model_vocab_sizes.size(); ++i) {
- model_index_map_[i].resize(model_vocab_sizes[i]);
- }
-}
-
-}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/universal_vocab.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/universal_vocab.hh b/ext/kenlm/lm/interpolate/universal_vocab.hh
deleted file mode 100644
index c720298..0000000
--- a/ext/kenlm/lm/interpolate/universal_vocab.hh
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef LM_INTERPOLATE_UNIVERSAL_VOCAB_H
-#define LM_INTERPOLATE_UNIVERSAL_VOCAB_H
-
-#include "lm/word_index.hh"
-
-#include <vector>
-#include <cstddef>
-
-namespace lm {
-namespace interpolate {
-
-class UniversalVocab {
-public:
- explicit UniversalVocab(const std::vector<WordIndex>& model_vocab_sizes);
-
- // GetUniversalIndex takes the model number and index for the specific
- // model and returns the universal model number
- WordIndex GetUniversalIdx(std::size_t model_num, WordIndex model_word_index) const {
- return model_index_map_[model_num][model_word_index];
- }
-
- const WordIndex *Mapping(std::size_t model) const {
- return &*model_index_map_[model].begin();
- }
-
- void InsertUniversalIdx(std::size_t model_num, WordIndex word_index,
- WordIndex universal_word_index) {
- model_index_map_[model_num][word_index] = universal_word_index;
- }
-
-private:
- std::vector<std::vector<WordIndex> > model_index_map_;
-};
-
-} // namespace interpolate
-} // namespace lm
-
-#endif // LM_INTERPOLATE_UNIVERSAL_VOCAB_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/kenlm_benchmark_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/kenlm_benchmark_main.cc b/ext/kenlm/lm/kenlm_benchmark_main.cc
deleted file mode 100644
index c9ee165..0000000
--- a/ext/kenlm/lm/kenlm_benchmark_main.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-#include "lm/model.hh"
-#include "util/file_stream.hh"
-#include "util/file.hh"
-#include "util/file_piece.hh"
-#include "util/usage.hh"
-
-#include <stdint.h>
-
-namespace {
-
-template <class Model, class Width> void ConvertToBytes(const Model &model, int fd_in) {
- util::FilePiece in(fd_in);
- util::FileStream out(1);
- Width width;
- StringPiece word;
- const Width end_sentence = (Width)model.GetVocabulary().EndSentence();
- while (true) {
- while (in.ReadWordSameLine(word)) {
- width = (Width)model.GetVocabulary().Index(word);
- out.write(&width, sizeof(Width));
- }
- if (!in.ReadLineOrEOF(word)) break;
- out.write(&end_sentence, sizeof(Width));
- }
-}
-
-template <class Model, class Width> void QueryFromBytes(const Model &model, int fd_in) {
- lm::ngram::State state[3];
- const lm::ngram::State *const begin_state = &model.BeginSentenceState();
- const lm::ngram::State *next_state = begin_state;
- Width kEOS = model.GetVocabulary().EndSentence();
- Width buf[4096];
-
- uint64_t completed = 0;
- double loaded = util::CPUTime();
-
- std::cout << "CPU_to_load: " << loaded << std::endl;
-
- // Numerical precision: batch sums.
- double total = 0.0;
- while (std::size_t got = util::ReadOrEOF(fd_in, buf, sizeof(buf))) {
- float sum = 0.0;
- UTIL_THROW_IF2(got % sizeof(Width), "File size not a multiple of vocab id size " << sizeof(Width));
- got /= sizeof(Width);
- completed += got;
- // Do even stuff first.
- const Width *even_end = buf + (got & ~1);
- // Alternating states
- const Width *i;
- for (i = buf; i != even_end;) {
- sum += model.FullScore(*next_state, *i, state[1]).prob;
- next_state = (*i++ == kEOS) ? begin_state : &state[1];
- sum += model.FullScore(*next_state, *i, state[0]).prob;
- next_state = (*i++ == kEOS) ? begin_state : &state[0];
- }
- // Odd corner case.
- if (got & 1) {
- sum += model.FullScore(*next_state, *i, state[2]).prob;
- next_state = (*i++ == kEOS) ? begin_state : &state[2];
- }
- total += sum;
- }
- double after = util::CPUTime();
- std::cerr << "Probability sum is " << total << std::endl;
- std::cout << "Queries: " << completed << std::endl;
- std::cout << "CPU_excluding_load: " << (after - loaded) << "\nCPU_per_query: " << ((after - loaded) / static_cast<double>(completed)) << std::endl;
- std::cout << "RSSMax: " << util::RSSMax() << std::endl;
-}
-
-template <class Model, class Width> void DispatchFunction(const Model &model, bool query) {
- if (query) {
- QueryFromBytes<Model, Width>(model, 0);
- } else {
- ConvertToBytes<Model, Width>(model, 0);
- }
-}
-
-template <class Model> void DispatchWidth(const char *file, bool query) {
- lm::ngram::Config config;
- config.load_method = util::READ;
- std::cerr << "Using load_method = READ." << std::endl;
- Model model(file, config);
- lm::WordIndex bound = model.GetVocabulary().Bound();
- if (bound <= 256) {
- DispatchFunction<Model, uint8_t>(model, query);
- } else if (bound <= 65536) {
- DispatchFunction<Model, uint16_t>(model, query);
- } else if (bound <= (1ULL << 32)) {
- DispatchFunction<Model, uint32_t>(model, query);
- } else {
- DispatchFunction<Model, uint64_t>(model, query);
- }
-}
-
-void Dispatch(const char *file, bool query) {
- using namespace lm::ngram;
- lm::ngram::ModelType model_type;
- if (lm::ngram::RecognizeBinary(file, model_type)) {
- switch(model_type) {
- case PROBING:
- DispatchWidth<lm::ngram::ProbingModel>(file, query);
- break;
- case REST_PROBING:
- DispatchWidth<lm::ngram::RestProbingModel>(file, query);
- break;
- case TRIE:
- DispatchWidth<lm::ngram::TrieModel>(file, query);
- break;
- case QUANT_TRIE:
- DispatchWidth<lm::ngram::QuantTrieModel>(file, query);
- break;
- case ARRAY_TRIE:
- DispatchWidth<lm::ngram::ArrayTrieModel>(file, query);
- break;
- case QUANT_ARRAY_TRIE:
- DispatchWidth<lm::ngram::QuantArrayTrieModel>(file, query);
- break;
- default:
- UTIL_THROW(util::Exception, "Unrecognized kenlm model type " << model_type);
- }
- } else {
- UTIL_THROW(util::Exception, "Binarize before running benchmarks.");
- }
-}
-
-} // namespace
-
-int main(int argc, char *argv[]) {
- if (argc != 3 || (strcmp(argv[1], "vocab") && strcmp(argv[1], "query"))) {
- std::cerr
- << "Benchmark program for KenLM. Intended usage:\n"
- << "#Convert text to vocabulary ids offline. These ids are tied to a model.\n"
- << argv[0] << " vocab $model <$text >$text.vocab\n"
- << "#Ensure files are in RAM.\n"
- << "cat $text.vocab $model >/dev/null\n"
- << "#Timed query against the model.\n"
- << argv[0] << " query $model <$text.vocab\n";
- return 1;
- }
- Dispatch(argv[2], !strcmp(argv[1], "query"));
- return 0;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/left.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/left.hh b/ext/kenlm/lm/left.hh
deleted file mode 100644
index 4d49686..0000000
--- a/ext/kenlm/lm/left.hh
+++ /dev/null
@@ -1,216 +0,0 @@
-/* Efficient left and right language model state for sentence fragments.
- * Intended usage:
- * Store ChartState with every chart entry.
- * To do a rule application:
- * 1. Make a ChartState object for your new entry.
- * 2. Construct RuleScore.
- * 3. Going from left to right, call Terminal or NonTerminal.
- * For terminals, just pass the vocab id.
- * For non-terminals, pass that non-terminal's ChartState.
- * If your decoder expects scores inclusive of subtree scores (i.e. you
- * label entries with the highest-scoring path), pass the non-terminal's
- * score as prob.
- * If your decoder expects relative scores and will walk the chart later,
- * pass prob = 0.0.
- * In other words, the only effect of prob is that it gets added to the
- * returned log probability.
- * 4. Call Finish. It returns the log probability.
- *
- * There's a couple more details:
- * Do not pass <s> to Terminal as it is formally not a word in the sentence,
- * only context. Instead, call BeginSentence. If called, it should be the
- * first call after RuleScore is constructed (since <s> is always the
- * leftmost).
- *
- * If the leftmost RHS is a non-terminal, it's faster to call BeginNonTerminal.
- *
- * Hashing and sorting comparison operators are provided. All state objects
- * are POD. If you intend to use memcmp on raw state objects, you must call
- * ZeroRemaining first, as the value of array entries beyond length is
- * otherwise undefined.
- *
- * Usage is of course not limited to chart decoding. Anything that generates
- * sentence fragments missing left context could benefit. For example, a
- * phrase-based decoder could pre-score phrases, storing ChartState with each
- * phrase, even if hypotheses are generated left-to-right.
- */
-
-#ifndef LM_LEFT_H
-#define LM_LEFT_H
-
-#include "lm/max_order.hh"
-#include "lm/state.hh"
-#include "lm/return.hh"
-
-#include "util/murmur_hash.hh"
-
-#include <algorithm>
-
-namespace lm {
-namespace ngram {
-
-template <class M> class RuleScore {
- public:
- explicit RuleScore(const M &model, ChartState &out) : model_(model), out_(&out), left_done_(false), prob_(0.0) {
- out.left.length = 0;
- out.right.length = 0;
- }
-
- void BeginSentence() {
- out_->right = model_.BeginSentenceState();
- // out_->left is empty.
- left_done_ = true;
- }
-
- void Terminal(WordIndex word) {
- State copy(out_->right);
- FullScoreReturn ret(model_.FullScore(copy, word, out_->right));
- if (left_done_) { prob_ += ret.prob; return; }
- if (ret.independent_left) {
- prob_ += ret.prob;
- left_done_ = true;
- return;
- }
- out_->left.pointers[out_->left.length++] = ret.extend_left;
- prob_ += ret.rest;
- if (out_->right.length != copy.length + 1)
- left_done_ = true;
- }
-
- // Faster version of NonTerminal for the case where the rule begins with a non-terminal.
- void BeginNonTerminal(const ChartState &in, float prob = 0.0) {
- prob_ = prob;
- *out_ = in;
- left_done_ = in.left.full;
- }
-
- void NonTerminal(const ChartState &in, float prob = 0.0) {
- prob_ += prob;
-
- if (!in.left.length) {
- if (in.left.full) {
- for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i;
- left_done_ = true;
- out_->right = in.right;
- }
- return;
- }
-
- if (!out_->right.length) {
- out_->right = in.right;
- if (left_done_) {
- prob_ += model_.UnRest(in.left.pointers, in.left.pointers + in.left.length, 1);
- return;
- }
- if (out_->left.length) {
- left_done_ = true;
- } else {
- out_->left = in.left;
- left_done_ = in.left.full;
- }
- return;
- }
-
- float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1];
- float *back = backoffs, *back2 = backoffs2;
- unsigned char next_use = out_->right.length;
-
- // First word
- if (ExtendLeft(in, next_use, 1, out_->right.backoff, back)) return;
-
- // Words after the first, so extending a bigram to begin with
- for (unsigned char extend_length = 2; extend_length <= in.left.length; ++extend_length) {
- if (ExtendLeft(in, next_use, extend_length, back, back2)) return;
- std::swap(back, back2);
- }
-
- if (in.left.full) {
- for (const float *i = back; i != back + next_use; ++i) prob_ += *i;
- left_done_ = true;
- out_->right = in.right;
- return;
- }
-
- // Right state was minimized, so it's already independent of the new words to the left.
- if (in.right.length < in.left.length) {
- out_->right = in.right;
- return;
- }
-
- // Shift exisiting words down.
- for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) {
- *(i + in.right.length) = *i;
- }
- // Add words from in.right.
- std::copy(in.right.words, in.right.words + in.right.length, out_->right.words);
- // Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
- std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff);
- std::copy(back, back + next_use, out_->right.backoff + in.right.length);
- out_->right.length = in.right.length + next_use;
- }
-
- float Finish() {
- // A N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram.
- out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1);
- return prob_;
- }
-
- void Reset() {
- prob_ = 0.0;
- left_done_ = false;
- out_->left.length = 0;
- out_->right.length = 0;
- }
- void Reset(ChartState &replacement) {
- out_ = &replacement;
- Reset();
- }
-
- private:
- bool ExtendLeft(const ChartState &in, unsigned char &next_use, unsigned char extend_length, const float *back_in, float *back_out) {
- ProcessRet(model_.ExtendLeft(
- out_->right.words, out_->right.words + next_use, // Words to extend into
- back_in, // Backoffs to use
- in.left.pointers[extend_length - 1], extend_length, // Words to be extended
- back_out, // Backoffs for the next score
- next_use)); // Length of n-gram to use in next scoring.
- if (next_use != out_->right.length) {
- left_done_ = true;
- if (!next_use) {
- // Early exit.
- out_->right = in.right;
- prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1);
- return true;
- }
- }
- // Continue scoring.
- return false;
- }
-
- void ProcessRet(const FullScoreReturn &ret) {
- if (left_done_) {
- prob_ += ret.prob;
- return;
- }
- if (ret.independent_left) {
- prob_ += ret.prob;
- left_done_ = true;
- return;
- }
- out_->left.pointers[out_->left.length++] = ret.extend_left;
- prob_ += ret.rest;
- }
-
- const M &model_;
-
- ChartState *out_;
-
- bool left_done_;
-
- float prob_;
-};
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_LEFT_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/left_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/left_test.cc b/ext/kenlm/lm/left_test.cc
deleted file mode 100644
index fdb6416..0000000
--- a/ext/kenlm/lm/left_test.cc
+++ /dev/null
@@ -1,397 +0,0 @@
-#include "lm/left.hh"
-#include "lm/model.hh"
-
-#include "util/tokenize_piece.hh"
-
-#include <vector>
-
-#define BOOST_TEST_MODULE LeftTest
-#include <boost/test/unit_test.hpp>
-#include <boost/test/floating_point_comparison.hpp>
-
-namespace lm {
-namespace ngram {
-namespace {
-
-// Feed `word` to the scorer as a terminal. Expands to a complete statement
-// (note the trailing semicolon) and requires variables named `score` and `m`
-// to be in scope at the point of use.
-#define Term(word) score.Terminal(m.GetVocabulary().Index(word));
-// Check that `word` maps to vocabulary id `value`; requires `m` in scope.
-#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);
-
-// Apparently some Boost versions use templates and are pretty strict about types matching.
-// All three arguments are therefore cast to double before BOOST_CHECK_CLOSE sees them.
-#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
-
-// Scores the two-word rule "more loin", then prepends a single word to it in
-// two different ways ("little" and "to") and checks the resulting score and
-// the left/right state lengths each time. The local names `score` and `m` are
-// required by the Term/VCheck macros.
-template <class M> void Short(const M &m) {
- ChartState base;
- {
- RuleScore<M> score(m, base);
- Term("more");
- Term("loin");
- SLOPPY_CHECK_CLOSE(-1.206319 - 0.3561665, score.Finish(), 0.001);
- }
- BOOST_CHECK(base.left.full);
- BOOST_CHECK_EQUAL(2, base.left.length);
- BOOST_CHECK_EQUAL(1, base.right.length);
- VCheck("loin", base.right.words[0]);
-
- // "little" followed by the already-scored `base`.
- ChartState more_left;
- {
- RuleScore<M> score(m, more_left);
- Term("little");
- score.NonTerminal(base, -1.206319 - 0.3561665);
- // p(little more loin | null context)
- SLOPPY_CHECK_CLOSE(-1.56538, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(3, more_left.left.length);
- BOOST_CHECK_EQUAL(1, more_left.right.length);
- VCheck("loin", more_left.right.words[0]);
- BOOST_CHECK(more_left.left.full);
-
- // "to" followed by `base`; the expected left state is shorter here.
- ChartState shorter;
- {
- RuleScore<M> score(m, shorter);
- Term("to");
- score.NonTerminal(base, -1.206319 - 0.3561665);
- SLOPPY_CHECK_CLOSE(-0.30103 - 1.687872 - 1.206319 - 0.3561665, score.Finish(), 0.01);
- }
- BOOST_CHECK_EQUAL(1, shorter.left.length);
- BOOST_CHECK_EQUAL(1, shorter.right.length);
- VCheck("loin", shorter.right.words[0]);
- BOOST_CHECK(shorter.left.full);
-}
-
-// Scores "on more", prepends "looking" to it, and finally prepends the
-// begin-of-sentence marker, checking score and state lengths after each step.
-// The local names `score` and `m` are required by the Term/VCheck macros.
-template <class M> void Charge(const M &m) {
- ChartState base;
- {
- RuleScore<M> score(m, base);
- Term("on");
- Term("more");
- SLOPPY_CHECK_CLOSE(-1.509559 -0.4771212 -1.206319, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(1, base.left.length);
- BOOST_CHECK_EQUAL(1, base.right.length);
- VCheck("more", base.right.words[0]);
- BOOST_CHECK(base.left.full);
-
- // "looking" followed by `base`.
- ChartState extend;
- {
- RuleScore<M> score(m, extend);
- Term("looking");
- score.NonTerminal(base, -1.509559 -0.4771212 -1.206319);
- SLOPPY_CHECK_CLOSE(-3.91039, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(2, extend.left.length);
- BOOST_CHECK_EQUAL(1, extend.right.length);
- VCheck("more", extend.right.words[0]);
- BOOST_CHECK(extend.left.full);
-
- // Begin-of-sentence followed by `extend`; the left state is expected to be empty.
- ChartState tobos;
- {
- RuleScore<M> score(m, tobos);
- score.BeginSentence();
- score.NonTerminal(extend, -3.91039);
- SLOPPY_CHECK_CLOSE(-3.471169, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(0, tobos.left.length);
- BOOST_CHECK_EQUAL(1, tobos.right.length);
-}
-
-// Reference scorer: pushes `words` through m.Score one at a time, carrying the
-// State forward. Starts from the begin-of-sentence state when begin_sentence
-// is set and from the null-context state otherwise. Returns the sum of the
-// per-word scores.
-template <class M> float LeftToRight(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) {
-  State context = begin_sentence ? m.BeginSentenceState() : m.NullContextState();
-  float total = 0.0;
-  for (std::vector<WordIndex>::size_type pos = 0; pos < words.size(); ++pos) {
-    // Score gets the previous state and the state to write as separate
-    // arguments, so snapshot the current one first.
-    State previous(context);
-    total += m.Score(previous, words[pos], context);
-  }
-  return total;
-}
-
-// Chart scorer that grows the sentence from its last word towards its first:
-// each step scores one terminal followed by the state built so far. When
-// begin_sentence is set, a final step prepends the begin-of-sentence marker.
-// Returns the score reported by the last Finish(), or 0.0 if nothing was scored.
-template <class M> float RightToLeft(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) {
-  ChartState accumulated;
-  accumulated.left.length = 0;
-  accumulated.right.length = 0;
-  accumulated.left.full = false;
-  float total = 0.0;
-  for (std::vector<WordIndex>::size_type remaining = words.size(); remaining != 0; --remaining) {
-    // Copy first: the scorer below writes into `accumulated`.
-    ChartState suffix(accumulated);
-    RuleScore<M> score(m, accumulated);
-    score.Terminal(words[remaining - 1]);
-    score.NonTerminal(suffix, total);
-    total = score.Finish();
-  }
-  if (!begin_sentence) return total;
-
-  ChartState suffix(accumulated);
-  RuleScore<M> score(m, accumulated);
-  score.BeginSentence();
-  score.NonTerminal(suffix, total);
-  return score.Finish();
-}
-
-// Chart scorer that combines `words` bottom-up as a balanced binary tree:
-// every word is first scored as its own terminal, then adjacent pairs of
-// states are merged level by level until a single state remains. Returns 0.0
-// for an empty sentence. When begin_sentence is set, the surviving state is
-// scored once more behind a begin-of-sentence marker.
-template <class M> float TreeMiddle(const M &m, const std::vector<WordIndex> &words, bool begin_sentence = false) {
- std::vector<std::pair<ChartState, float> > states(words.size());
- for (unsigned int i = 0; i < words.size(); ++i) {
- RuleScore<M> score(m, states[i].first);
- score.Terminal(words[i]);
- states[i].second = score.Finish();
- }
- // Each pass halves the number of states (rounding up).
- while (states.size() > 1) {
- std::vector<std::pair<ChartState, float> > upper((states.size() + 1) / 2);
- for (unsigned int i = 0; i < states.size() / 2; ++i) {
- RuleScore<M> score(m, upper[i].first);
- score.NonTerminal(states[i*2].first, states[i*2].second);
- score.NonTerminal(states[i*2+1].first, states[i*2+1].second);
- upper[i].second = score.Finish();
- }
- // An unpaired trailing state is carried up unchanged.
- if (states.size() % 2) {
- upper.back() = states.back();
- }
- std::swap(states, upper);
- }
-
- if (states.empty()) return 0.0;
-
- if (begin_sentence) {
- ChartState ignored;
- RuleScore<M> score(m, ignored);
- score.BeginSentence();
- score.NonTerminal(states.front().first, states.front().second);
- return score.Finish();
- } else {
- return states.front().second;
- }
-
-}
-
-// Replace the contents of `out` with the vocabulary ids of the tokens of
-// `str`, split on single spaces, as looked up in m.GetVocabulary().
-template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vector<WordIndex> &out) {
-  out.clear();
-  util::TokenIter<util::SingleCharacter, true> token(str, ' ');
-  while (token) {
-    out.push_back(m.GetVocabulary().Index(*token));
-    ++token;
-  }
-}
-
-// Looks `str` up in the vocabulary of `m`, scores it with LeftToRight as the
-// reference, and checks that RightToLeft and TreeMiddle agree with it.
-// Requires `m`, `words`, `expect` and `rest` to be in scope at the use site.
-// The last line deliberately has no continuation backslash, so the macro ends
-// here regardless of what follows it in the file.
-#define TEXT_TEST(str) \
-  LookupVocab(m, str, words); \
-  expect = LeftToRight(m, words, rest); \
-  SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \
-  SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001);
-
-// Build sentences, or parts thereof, from right to left.
-// Each TEXT_TEST line checks that the three scoring strategies agree on that
-// text. `rest` is forwarded as the begin_sentence argument of the scorers.
-// The local names `words`, `expect`, `rest` and `m` are used by TEXT_TEST.
-template <class M> void GrowBig(const M &m, bool rest = false) {
- std::vector<WordIndex> words;
- float expect;
- TEXT_TEST("in biarritz watching considering looking . on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>");
- TEXT_TEST("on a little more loin also would consider higher to look good unknown the screening foo bar , unknown however unknown </s>");
- TEXT_TEST("on a little more loin also would consider higher to look good");
- TEXT_TEST("more loin also would consider higher to look good");
- TEXT_TEST("more loin also would consider higher to look");
- TEXT_TEST("also would consider higher to look");
- TEXT_TEST("also would consider higher");
- TEXT_TEST("would consider higher to look");
- TEXT_TEST("consider higher to look");
- TEXT_TEST("consider higher to");
- TEXT_TEST("consider higher");
-}
-
-// Same agreement check as GrowBig on three short texts. The local names
-// `words`, `expect`, `rest` and `m` are used by TEXT_TEST.
-template <class M> void GrowSmall(const M &m, bool rest = false) {
- std::vector<WordIndex> words;
- float expect;
- TEXT_TEST("in biarritz watching considering looking . </s>");
- TEXT_TEST("in biarritz watching considering looking .");
- TEXT_TEST("in biarritz");
-}
-
-// Builds "also would consider higher" piece by piece: single words, then the
-// pairs "also would" (both by combining states and by scoring two terminals)
-// and "consider higher", and finally the two pairs together. Scores and state
-// lengths are checked against hard-coded expectations at every step.
-template <class M> void AlsoWouldConsiderHigher(const M &m) {
- ChartState also;
- {
- RuleScore<M> score(m, also);
- score.Terminal(m.GetVocabulary().Index("also"));
- SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
- }
- ChartState would;
- {
- RuleScore<M> score(m, would);
- score.Terminal(m.GetVocabulary().Index("would"));
- SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
- }
- // "also would" obtained by joining the two single-word states.
- ChartState combine_also_would;
- {
- RuleScore<M> score(m, combine_also_would);
- score.NonTerminal(also, -1.687872);
- score.NonTerminal(would, -1.687872);
- SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(2, combine_also_would.right.length);
-
- // The same pair scored directly from terminals must give the same result.
- ChartState also_would;
- {
- RuleScore<M> score(m, also_would);
- score.Terminal(m.GetVocabulary().Index("also"));
- score.Terminal(m.GetVocabulary().Index("would"));
- SLOPPY_CHECK_CLOSE(-1.687872 - 2.0, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(2, also_would.right.length);
-
- ChartState consider;
- {
- RuleScore<M> score(m, consider);
- score.Terminal(m.GetVocabulary().Index("consider"));
- SLOPPY_CHECK_CLOSE(-1.687872, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(1, consider.left.length);
- BOOST_CHECK_EQUAL(1, consider.right.length);
- BOOST_CHECK(!consider.left.full);
-
- // The score of "higher" is kept in a variable because it is reused below.
- ChartState higher;
- float higher_score;
- {
- RuleScore<M> score(m, higher);
- score.Terminal(m.GetVocabulary().Index("higher"));
- higher_score = score.Finish();
- }
- SLOPPY_CHECK_CLOSE(-1.509559, higher_score, 0.001);
- BOOST_CHECK_EQUAL(1, higher.left.length);
- BOOST_CHECK_EQUAL(1, higher.right.length);
- BOOST_CHECK(!higher.left.full);
- VCheck("higher", higher.right.words[0]);
- SLOPPY_CHECK_CLOSE(-0.30103, higher.right.backoff[0], 0.001);
-
- ChartState consider_higher;
- {
- RuleScore<M> score(m, consider_higher);
- score.NonTerminal(consider, -1.687872);
- score.NonTerminal(higher, higher_score);
- SLOPPY_CHECK_CLOSE(-1.509559 - 1.687872 - 0.30103, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(2, consider_higher.left.length);
- BOOST_CHECK(!consider_higher.left.full);
-
- // All four words: "also would" followed by "consider higher".
- ChartState full;
- {
- RuleScore<M> score(m, full);
- score.NonTerminal(combine_also_would, -1.687872 - 2.0);
- score.NonTerminal(consider_higher, -1.509559 - 1.687872 - 0.30103);
- SLOPPY_CHECK_CLOSE(-10.6879, score.Finish(), 0.001);
- }
- BOOST_CHECK_EQUAL(4, full.right.length);
-}
-
-// Evaluate `val` exactly once (it may be an assignment with side effects),
-// then check it against the LeftToRight score of `str`. Requires `m` in
-// scope. The braces keep `got` and `indices` local to each use.
-#define CHECK_SCORE(str, val) \
-{ \
- float got = val; \
- std::vector<WordIndex> indices; \
- LookupVocab(m, str, indices); \
- SLOPPY_CHECK_CLOSE(LeftToRight(m, indices), got, 0.001); \
-}
-
-// Scores the seven tokens of "in biarritz watching considering looking . </s>"
-// as an explicit three-level binary tree (lexical -> l1 -> l2 -> top) and
-// checks the score of several nodes against LeftToRight over the same words.
-template <class M> void FullGrow(const M &m) {
- std::vector<WordIndex> words;
- LookupVocab(m, "in biarritz watching considering looking . </s>", words);
-
- // Level 0: one state per token.
- ChartState lexical[7];
- float lexical_scores[7];
- for (unsigned int i = 0; i < 7; ++i) {
- RuleScore<M> score(m, lexical[i]);
- score.Terminal(words[i]);
- lexical_scores[i] = score.Finish();
- }
- CHECK_SCORE("in", lexical_scores[0]);
- CHECK_SCORE("biarritz", lexical_scores[1]);
- CHECK_SCORE("watching", lexical_scores[2]);
- CHECK_SCORE("</s>", lexical_scores[6]);
-
- // Level 1: adjacent pairs; the seventh token is carried up unchanged.
- // CHECK_SCORE evaluates its second argument once, so the assignments inside
- // it both record and verify the score.
- ChartState l1[4];
- float l1_scores[4];
- {
- RuleScore<M> score(m, l1[0]);
- score.NonTerminal(lexical[0], lexical_scores[0]);
- score.NonTerminal(lexical[1], lexical_scores[1]);
- CHECK_SCORE("in biarritz", l1_scores[0] = score.Finish());
- }
- {
- RuleScore<M> score(m, l1[1]);
- score.NonTerminal(lexical[2], lexical_scores[2]);
- score.NonTerminal(lexical[3], lexical_scores[3]);
- CHECK_SCORE("watching considering", l1_scores[1] = score.Finish());
- }
- {
- RuleScore<M> score(m, l1[2]);
- score.NonTerminal(lexical[4], lexical_scores[4]);
- score.NonTerminal(lexical[5], lexical_scores[5]);
- CHECK_SCORE("looking .", l1_scores[2] = score.Finish());
- }
- BOOST_CHECK_EQUAL(l1[2].left.length, 1);
- l1[3] = lexical[6];
- l1_scores[3] = lexical_scores[6];
-
- // Level 2: pairs of level-1 states.
- ChartState l2[2];
- float l2_scores[2];
- {
- RuleScore<M> score(m, l2[0]);
- score.NonTerminal(l1[0], l1_scores[0]);
- score.NonTerminal(l1[1], l1_scores[1]);
- CHECK_SCORE("in biarritz watching considering", l2_scores[0] = score.Finish());
- }
- {
- RuleScore<M> score(m, l2[1]);
- score.NonTerminal(l1[2], l1_scores[2]);
- score.NonTerminal(l1[3], l1_scores[3]);
- CHECK_SCORE("looking . </s>", l2_scores[1] = score.Finish());
- }
- BOOST_CHECK_EQUAL(l2[1].left.length, 1);
- BOOST_CHECK(l2[1].left.full);
-
- // Root: the whole sentence.
- ChartState top;
- {
- RuleScore<M> score(m, top);
- score.NonTerminal(l2[0], l2_scores[0]);
- score.NonTerminal(l2[1], l2_scores[1]);
- CHECK_SCORE("in biarritz watching considering looking . </s>", score.Finish());
- }
-}
-
-// Path of the model file used by these tests: the first command-line argument
-// given to the test binary when there is one, otherwise "test.arpa".
-const char *FileLocation() {
-  if (boost::unit_test::framework::master_test_suite().argc >= 2) {
-    return boost::unit_test::framework::master_test_suite().argv[1];
-  }
-  return "test.arpa";
-}
-
-// Load the model at FileLocation() as model type M, with config.messages set
-// to NULL, and run every test routine in this file against it.
-template <class M> void Everything() {
- Config config;
- config.messages = NULL;
- M m(FileLocation(), config);
-
- Short(m);
- Charge(m);
- GrowBig(m);
- AlsoWouldConsiderHigher(m);
- GrowSmall(m);
- FullGrow(m);
-}
-
-// One test case per model type; each runs the complete suite via Everything<>.
-BOOST_AUTO_TEST_CASE(ProbingAll) {
- Everything<Model>();
-}
-BOOST_AUTO_TEST_CASE(TrieAll) {
- Everything<TrieModel>();
-}
-BOOST_AUTO_TEST_CASE(QuantTrieAll) {
- Everything<QuantTrieModel>();
-}
-BOOST_AUTO_TEST_CASE(ArrayQuantTrieAll) {
- Everything<QuantArrayTrieModel>();
-}
-BOOST_AUTO_TEST_CASE(ArrayTrieAll) {
- Everything<ArrayTrieModel>();
-}
-
-// RestProbingModel only runs GrowBig, with its second argument set to true
-// (which GrowBig forwards as begin_sentence to the scorers).
-BOOST_AUTO_TEST_CASE(RestProbing) {
- Config config;
- config.messages = NULL;
- RestProbingModel m(FileLocation(), config);
- GrowBig(m, true);
-}
-
-} // namespace
-} // namespace ngram
-} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/lm_exception.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/lm_exception.cc b/ext/kenlm/lm/lm_exception.cc
deleted file mode 100644
index 58d468f..0000000
--- a/ext/kenlm/lm/lm_exception.cc
+++ /dev/null
@@ -1,23 +0,0 @@
-#include "lm/lm_exception.hh"
-
-#include <cerrno>
-#include <cstdio>
-
-namespace lm {
-
-// Out-of-line definitions of the constructors and destructors declared in
-// lm/lm_exception.hh. All of them are empty and declared non-throwing.
-ConfigException::ConfigException() throw() {}
-ConfigException::~ConfigException() throw() {}
-
-LoadException::LoadException() throw() {}
-LoadException::~LoadException() throw() {}
-
-FormatLoadException::FormatLoadException() throw() {}
-FormatLoadException::~FormatLoadException() throw() {}
-
-VocabLoadException::VocabLoadException() throw() {}
-VocabLoadException::~VocabLoadException() throw() {}
-
-SpecialWordMissingException::SpecialWordMissingException() throw() {}
-SpecialWordMissingException::~SpecialWordMissingException() throw() {}
-
-} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/lm_exception.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/lm_exception.hh b/ext/kenlm/lm/lm_exception.hh
deleted file mode 100644
index 85a5738..0000000
--- a/ext/kenlm/lm/lm_exception.hh
+++ /dev/null
@@ -1,50 +0,0 @@
-#ifndef LM_LM_EXCEPTION_H
-#define LM_LM_EXCEPTION_H
-
-// Named to avoid conflict with util/exception.hh.
-
-#include "util/exception.hh"
-#include "util/string_piece.hh"
-
-#include <exception>
-#include <string>
-
-namespace lm {
-
-// Selects how a warning-level condition is handled.
-// NOTE(review): presumably THROW_UP raises an exception, COMPLAIN prints a
-// message and SILENT does nothing -- confirm against the code that reads it.
-typedef enum {THROW_UP, COMPLAIN, SILENT} WarningAction;
-
-// Exception type for problems with configuration values.
-class ConfigException : public util::Exception {
-  public:
-    ~ConfigException() throw();
-    ConfigException() throw();
-};
-
-// Common base of the loading-related exceptions declared below. The
-// constructor is protected, so only derived classes can be created.
-class LoadException : public util::Exception {
-  protected:
-    LoadException() throw();
-
-  public:
-    virtual ~LoadException() throw();
-};
-
-// LoadException subtype for problems with the format of a model file.
-class FormatLoadException : public LoadException {
-  public:
-    ~FormatLoadException() throw();
-    FormatLoadException() throw();
-};
-
-// LoadException subtype for vocabulary problems; also serves as the base of
-// SpecialWordMissingException.
-class VocabLoadException : public LoadException {
-  public:
-    VocabLoadException() throw();
-    virtual ~VocabLoadException() throw();
-};
-
-// VocabLoadException subtype.
-// NOTE(review): by its name, raised when a required special token is absent
-// from the vocabulary -- confirm at the throw sites.
-class SpecialWordMissingException : public VocabLoadException {
-  public:
-    ~SpecialWordMissingException() throw();
-    explicit SpecialWordMissingException() throw();
-};
-
-} // namespace lm
-
-#endif // LM_LM_EXCEPTION_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/max_order.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/max_order.hh b/ext/kenlm/lm/max_order.hh
deleted file mode 100644
index 0ad1379..0000000
--- a/ext/kenlm/lm/max_order.hh
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef LM_MAX_ORDER_H
-#define LM_MAX_ORDER_H
-/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
- * This header does not define KENLM_MAX_ORDER itself; it only supplies
- * KENLM_ORDER_MESSAGE, the help text attached to order-related errors.
- * NOTE(review): confirm where the default maximum order now comes from.
- * Having this limit means that State can be
- * (kMaxOrder - 1) * sizeof(float) bytes instead of
- * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
- */
-#ifndef KENLM_ORDER_MESSAGE
-#define KENLM_ORDER_MESSAGE "If your build system supports changing KENLM_MAX_ORDER, change it there and recompile. In the KenLM tarball or Moses, use e.g. `bjam --max-kenlm-order=6 -a'. Otherwise, edit lm/max_order.hh."
-#endif
-
-#endif // LM_MAX_ORDER_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/model.cc b/ext/kenlm/lm/model.cc
deleted file mode 100644
index a5a16bf..0000000
--- a/ext/kenlm/lm/model.cc
+++ /dev/null
@@ -1,349 +0,0 @@
-#include "lm/model.hh"
-
-#include "lm/blank.hh"
-#include "lm/lm_exception.hh"
-#include "lm/search_hashed.hh"
-#include "lm/search_trie.hh"
-#include "lm/read_arpa.hh"
-#include "util/have.hh"
-#include "util/murmur_hash.hh"
-
-#include <algorithm>
-#include <functional>
-#include <numeric>
-#include <cmath>
-#include <limits>
-
-namespace lm {
-namespace ngram {
-namespace detail {
-
-// Out-of-class definition of the static member; its value is taken from the Search policy.
-template <class Search, class VocabularyT> const ModelType GenericModel<Search, VocabularyT>::kModelType = Search::kModelType;
-
-// Total size for a model with the given n-gram counts: the size reported by
-// the search structure plus the size reported by the vocabulary, which is
-// sized from the unigram count counts[0].
-template <class Search, class VocabularyT> uint64_t GenericModel<Search, VocabularyT>::Size(const std::vector<uint64_t> &counts, const Config &config) {
-  return Search::Size(counts, config) + VocabularyT::Size(counts[0], config);
-}
-
-// Lay out the vocabulary and then the search structure inside the memory
-// region starting at `base`, and verify that the bytes consumed equal what
-// Size() predicts. Throws FormatLoadException when the two disagree.
-template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::SetupMemory(void *base, const std::vector<uint64_t> &counts, const Config &config) {
-  const size_t expected_bytes = util::CheckOverflow(Size(counts, config));
-  uint8_t *const region_begin = static_cast<uint8_t*>(base);
-  const size_t vocab_bytes = VocabularyT::Size(counts[0], config);
-  vocab_.SetupMemory(region_begin, vocab_bytes, counts[0], config);
-  // The search structure starts right behind the vocabulary and reports where it ends.
-  uint8_t *const region_end = search_.SetupMemory(region_begin + vocab_bytes, counts, config);
-  if (static_cast<std::size_t>(region_end - region_begin) == expected_bytes) return;
-  UTIL_THROW(FormatLoadException, "The data structures took " << (region_end - region_begin) << " but Size says they should take " << expected_bytes);
-}
-
-namespace {
-// Optionally tell the user, through config.messages, that loading from ARPA
-// is slow. Nothing is printed when config.write_mmap is set or when there is
-// no message stream. With arpa_complain == ALL the hint is always printed;
-// with EXPENSIVE it is printed only for the trie model types.
-void ComplainAboutARPA(const Config &config, ModelType model_type) {
-  if (config.write_mmap || !config.messages) return;
-  if (config.arpa_complain == Config::ALL) {
-    *config.messages << "Loading the LM will be faster if you build a binary file." << std::endl;
-    return;
-  }
-  if (config.arpa_complain != Config::EXPENSIVE) return;
-  switch (model_type) {
-    case TRIE:
-    case QUANT_TRIE:
-    case ARRAY_TRIE:
-    case QUANT_ARRAY_TRIE:
-      *config.messages << "Building " << kModelNames[model_type] << " from ARPA is expensive.  Save time by building a binary format." << std::endl;
-      break;
-    default:
-      break;
-  }
-}
-
-// Validate the per-order n-gram counts of a model. Throws FormatLoadException
-// when the model has more orders than KENLM_MAX_ORDER, and
-// util::OverflowException when size_t is narrower than 64 bits and some count
-// does not fit in it.
-void CheckCounts(const std::vector<uint64_t> &counts) {
-  UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ".  " << KENLM_ORDER_MESSAGE);
-  // Every count fits when size_t is at least as wide as uint64_t.
-  if (sizeof(uint64_t) <= sizeof(std::size_t)) return;
-  const uint64_t addressable = static_cast<uint64_t>(std::numeric_limits<size_t>::max());
-  for (std::size_t order = 1; order <= counts.size(); ++order) {
-    const uint64_t ngrams = counts[order - 1];
-    UTIL_THROW_IF(ngrams > addressable, util::OverflowException, "This model has " << ngrams << " " << order << "-grams which is too many for 32-bit machines.");
-  }
-}
-
-} // namespace
-
-// Load a model from `file`. A file that IsBinaryFormat recognises is loaded
-// through backing_ after its parameters have been validated; anything else is
-// handed to InitializeFromARPA. Afterwards the begin-of-sentence and
-// null-context states are built and passed to the facade base via P::Init.
-template <class Search, class VocabularyT> GenericModel<Search, VocabularyT>::GenericModel(const char *file, const Config &init_config) : backing_(init_config) {
- util::scoped_fd fd(util::OpenReadOrThrow(file));
- if (IsBinaryFormat(fd.get())) {
- Parameters parameters;
- // The descriptor is released from the scoped_fd before being given to backing_.
- // NOTE(review): presumably backing_ takes ownership of it -- confirm.
- int fd_shallow = fd.release();
- backing_.InitializeBinary(fd_shallow, kModelType, kVersion, parameters);
- CheckCounts(parameters.counts);
-
- // Work on a copy of the configuration; some settings come from the file.
- Config new_config(init_config);
- new_config.probing_multiplier = parameters.fixed.probing_multiplier;
- Search::UpdateConfigFromBinary(backing_, parameters.counts, VocabularyT::Size(parameters.counts[0], new_config), new_config);
- UTIL_THROW_IF(new_config.enumerate_vocab && !parameters.fixed.has_vocabulary, FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary.");
-
- SetupMemory(backing_.LoadBinary(Size(parameters.counts, new_config)), parameters.counts, new_config);
- vocab_.LoadedBinary(parameters.fixed.has_vocabulary, fd_shallow, new_config.enumerate_vocab, backing_.VocabStringReadingOffset());
- } else {
- ComplainAboutARPA(init_config, kModelType);
- InitializeFromARPA(fd.release(), file, init_config);
- }
-
- // g++ prints warnings unless these are fully initialized.
- State begin_sentence = State();
- begin_sentence.length = 1;
- begin_sentence.words[0] = vocab_.BeginSentence();
- // Only the backoff of the begin-of-sentence unigram is needed; the other
- // outputs of the lookup are discarded.
- typename Search::Node ignored_node;
- bool ignored_independent_left;
- uint64_t ignored_extend_left;
- begin_sentence.backoff[0] = search_.LookupUnigram(begin_sentence.words[0], ignored_node, ignored_independent_left, ignored_extend_left).Backoff();
- State null_context = State();
- null_context.length = 0;
- P::Init(begin_sentence, null_context, vocab_, search_.Order());
-}
-
-// Build the model from the ARPA text open on `fd` (`file` is its name).
-// Reads and validates the n-gram counts, sets up the vocabulary, lets search_
-// load the n-grams, supplies default values for the unknown word if the file
-// had none, and finishes the backing file. Any util::Exception raised on the
-// way is annotated with the current byte offset and rethrown.
-template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::InitializeFromARPA(int fd, const char *file, const Config &config) {
- // Backing file is the ARPA.
- util::FilePiece f(fd, file, config.ProgressMessages());
- try {
- std::vector<uint64_t> counts;
- // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
- ReadARPACounts(f, counts);
- CheckCounts(counts);
- if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model.");
- if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
-
- std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config));
- // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs.
- vocab_.SetupMemory(backing_.SetupJustVocab(vocab_size, counts.size()), vocab_size, counts[0], config);
-
- if (config.write_mmap && config.include_vocab) {
- // Route vocabulary enumeration through a wrapper whose buffer is later
- // passed to backing_.WriteVocabWords.
- WriteWordsWrapper wrap(config.enumerate_vocab);
- vocab_.ConfigureEnumerate(&wrap, counts[0]);
- search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
- void *vocab_rebase, *search_rebase;
- backing_.WriteVocabWords(wrap.Buffer(), vocab_rebase, search_rebase);
- // Due to writing at the end of file, mmap may have relocated data. So remap.
- vocab_.Relocate(vocab_rebase);
- search_.SetupMemory(reinterpret_cast<uint8_t*>(search_rebase), counts, config);
- } else {
- vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]);
- search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
- }
-
- if (!vocab_.SawUnk()) {
- assert(config.unknown_missing != THROW_UP);
- // Default probabilities for unknown.
- search_.UnknownUnigram().backoff = 0.0;
- search_.UnknownUnigram().prob = config.unknown_missing_logprob;
- }
- backing_.FinishFile(config, kModelType, kVersion, counts);
- } catch (util::Exception &e) {
- e << " Byte: " << f.Offset();
- throw;
- }
-}
-
-// Score new_word in the context held by in_state and write the following
-// state to out_state. ScoreExceptBackoff supplies the probability of the
-// longest matching n-gram; the backoffs stored in in_state at positions
-// ngram_length - 1 up to in_state.length - 1 are then added on top.
-template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const {
-  FullScoreReturn result = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state);
-  const float *const charge_end = in_state.backoff + in_state.length;
-  const float *charge = in_state.backoff + result.ngram_length - 1;
-  while (charge < charge_end) {
-    result.prob += *charge;
-    ++charge;
-  }
-  return result;
-}
-
-// Score new_word given an explicit context instead of a State. The context is
-// the word range [context_rbegin, context_rend); at most Order() - 1 words of
-// it are used. Since no State carries the backoff weights, they are looked up
-// again here and added to the probability from ScoreExceptBackoff.
-template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const {
- context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
- FullScoreReturn ret = ScoreExceptBackoff(context_rbegin, context_rend, new_word, out_state);
-
- // Add the backoff weights for n-grams of order start to (context_rend - context_rbegin).
- unsigned char start = ret.ngram_length;
- // Nothing to add when the context is shorter than `start` words.
- if (context_rend - context_rbegin < static_cast<std::ptrdiff_t>(start)) return ret;
-
- bool independent_left;
- uint64_t extend_left;
- typename Search::Node node;
- if (start <= 1) {
- ret.prob += search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff();
- start = 2;
- } else if (!search_.FastMakeNode(context_rbegin, context_rbegin + start - 1, node)) {
- return ret;
- }
- // i is the order of the backoff we're looking for.
- unsigned char order_minus_2 = start - 2;
- // Walk to longer context n-grams, stopping at the first one that is not found.
- for (const WordIndex *i = context_rbegin + start - 1; i < context_rend; ++i, ++order_minus_2) {
- typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left));
- if (!p.Found()) break;
- ret.prob += p.Backoff();
- }
- return ret;
-}
-
-// Fill out_state from the context words [context_rbegin, context_rend), using
-// at most Order() - 1 of them. Backoffs are looked up for increasingly long
-// context n-grams; out_state.length ends up as the length of the longest one
-// for which HasExtension(backoff) holds (0 if none), and that many words are
-// copied into out_state.words.
-template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const {
- // Generate a state from context.
- context_rend = std::min(context_rend, context_rbegin + P::Order() - 1);
- if (context_rend == context_rbegin) {
- out_state.length = 0;
- return;
- }
- typename Search::Node node;
- bool independent_left;
- uint64_t extend_left;
- out_state.backoff[0] = search_.LookupUnigram(*context_rbegin, node, independent_left, extend_left).Backoff();
- out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
- float *backoff_out = out_state.backoff + 1;
- unsigned char order_minus_2 = 0;
- for (const WordIndex *i = context_rbegin + 1; i < context_rend; ++i, ++backoff_out, ++order_minus_2) {
- typename Search::MiddlePointer p(search_.LookupMiddle(order_minus_2, *i, node, independent_left, extend_left));
- if (!p.Found()) {
- // No longer context n-gram exists: keep what has been found so far.
- std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words);
- return;
- }
- *backoff_out = p.Backoff();
- if (HasExtension(*backoff_out)) out_state.length = i - context_rbegin + 1;
- }
- std::copy(context_rbegin, context_rbegin + out_state.length, out_state.words);
-}
-
-// Extend an already-scored n-gram, identified by extend_pointer and
-// extend_length, further to the left with the words [add_rbegin, add_rend).
-// For extend_length == 1 the pointer is treated as a word index, otherwise it
-// is unpacked by the search structure. The returned prob and rest are
-// relative: the rest value of the starting n-gram is subtracted from both.
-//   backoff_in  backoffs to charge for the added words that were not matched.
-//   backoff_out receives backoffs found while extending (written by ResumeScore).
-//   next_use    set to the value ResumeScore leaves minus extend_length.
-template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ExtendLeft(
- const WordIndex *add_rbegin, const WordIndex *add_rend,
- const float *backoff_in,
- uint64_t extend_pointer,
- unsigned char extend_length,
- float *backoff_out,
- unsigned char &next_use) const {
- FullScoreReturn ret;
- typename Search::Node node;
- if (extend_length == 1) {
- typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast<WordIndex>(extend_pointer), node, ret.independent_left, ret.extend_left));
- ret.rest = ptr.Rest();
- ret.prob = ptr.Prob();
- assert(!ret.independent_left);
- } else {
- typename Search::MiddlePointer ptr(search_.Unpack(extend_pointer, extend_length, node));
- ret.rest = ptr.Rest();
- ret.prob = ptr.Prob();
- ret.extend_left = extend_pointer;
- // If this function is called, then it does depend on left words.
- ret.independent_left = false;
- }
- // Remember the starting rest value; it is removed again at the end.
- float subtract_me = ret.rest;
- ret.ngram_length = extend_length;
- next_use = extend_length;
- ResumeScore(add_rbegin, add_rend, extend_length - 1, node, backoff_out, next_use, ret);
- next_use -= extend_length;
- // Charge backoffs.
- for (const float *b = backoff_in + ret.ngram_length - extend_length; b < backoff_in + (add_rend - add_rbegin); ++b) ret.prob += *b;
- ret.prob -= subtract_me;
- ret.rest -= subtract_me;
- return ret;
-}
-
-namespace {
-// Paranoid copy of the history. out_state.words[0] is assumed to hold
-// new_word already, so only slots 1 .. out_state.length - 1 are filled from
-// `from`. out_state.length may be zero, which makes the bound negative and
-// the loop a no-op; that is why std::copy is not used here.
-void CopyRemainingHistory(const WordIndex *from, State &out_state) {
-  const ptrdiff_t to_copy = static_cast<ptrdiff_t>(out_state.length) - 1;
-  for (ptrdiff_t offset = 0; offset < to_copy; ++offset) {
-    out_state.words[offset + 1] = from[offset];
-  }
-}
-} // namespace
-
-/* Ugly optimized function. Produce a score excluding backoff.
- * The search goes in increasing order of ngram length.
- * Context goes backward, so context_begin is the word immediately preceding
- * new_word.
- * out_state receives new_word in words[0], the backoffs found along the way,
- * and (via CopyRemainingHistory) the context words that remain usable.
- */
-template <class Search, class VocabularyT> FullScoreReturn GenericModel<Search, VocabularyT>::ScoreExceptBackoff(
- const WordIndex *const context_rbegin,
- const WordIndex *const context_rend,
- const WordIndex new_word,
- State &out_state) const {
- assert(new_word < vocab_.Bound());
- FullScoreReturn ret;
- // ret.ngram_length contains the last known non-blank ngram length.
- ret.ngram_length = 1;
-
- typename Search::Node node;
- typename Search::UnigramPointer uni(search_.LookupUnigram(new_word, node, ret.independent_left, ret.extend_left));
- out_state.backoff[0] = uni.Backoff();
- ret.prob = uni.Prob();
- ret.rest = uni.Rest();
-
- // This is the length of the context that should be used for continuation to the right.
- out_state.length = HasExtension(out_state.backoff[0]) ? 1 : 0;
- // We'll write the word anyway since it will probably be used and does no harm being there.
- out_state.words[0] = new_word;
- // With no context the unigram result is final.
- if (context_rbegin == context_rend) return ret;
-
- ResumeScore(context_rbegin, context_rend, 0, node, out_state.backoff + 1, out_state.length, ret);
- CopyRemainingHistory(context_rbegin, out_state);
- return ret;
-}
-
-// Continue an n-gram lookup from `node`, consuming one history word per
-// iteration starting at hist_iter. `ret` is updated with the probability,
-// rest value and length of the longest n-gram found; each backoff found is
-// written through backoff_out; next_use is raised to the length of the
-// longest n-gram whose backoff satisfies HasExtension. The walk stops when
-// the history is exhausted, when ret.independent_left is set, or when a
-// lookup fails. On reaching order_minus_2 == Order() - 2 the loop is left and
-// one final LookupLongest is made.
-template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT>::ResumeScore(const WordIndex *hist_iter, const WordIndex *const context_rend, unsigned char order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const {
- for (; ; ++order_minus_2, ++hist_iter, ++backoff_out) {
- if (hist_iter == context_rend) return;
- if (ret.independent_left) return;
- if (order_minus_2 == P::Order() - 2) break;
-
- typename Search::MiddlePointer pointer(search_.LookupMiddle(order_minus_2, *hist_iter, node, ret.independent_left, ret.extend_left));
- if (!pointer.Found()) return;
- *backoff_out = pointer.Backoff();
- ret.prob = pointer.Prob();
- ret.rest = pointer.Rest();
- ret.ngram_length = order_minus_2 + 2;
- if (HasExtension(*backoff_out)) {
- next_use = ret.ngram_length;
- }
- }
- // Only reached through the break above. No backoff is read or written for
- // this last lookup, and independent_left is set whether or not it succeeds.
- ret.independent_left = true;
- typename Search::LongestPointer longest(search_.LookupLongest(*hist_iter, node));
- if (longest.Found()) {
- ret.prob = longest.Prob();
- ret.rest = ret.prob;
- // There is no blank in longest_.
- ret.ngram_length = P::Order();
- }
-}
-
-// Sum of (Prob() - Rest()) over the n-grams referenced by
-// [pointers_begin, pointers_end). first_length is the n-gram length of the
-// first pointer; each following pointer is treated as one word longer. A
-// first pointer of length 1 is looked up as a unigram (the pointer value is
-// used as a word index); all others are unpacked through search_.Unpack.
-// Returns 0.0 when first_length is 1 and the range is empty.
-template <class Search, class VocabularyT> float GenericModel<Search, VocabularyT>::InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
- float ret;
- typename Search::Node node;
- if (first_length == 1) {
- if (pointers_begin >= pointers_end) return 0.0;
- bool independent_left;
- uint64_t extend_left;
- typename Search::UnigramPointer ptr(search_.LookupUnigram(static_cast<WordIndex>(*pointers_begin), node, independent_left, extend_left));
- ret = ptr.Prob() - ptr.Rest();
- ++first_length;
- ++pointers_begin;
- } else {
- ret = 0.0;
- }
- for (const uint64_t *i = pointers_begin; i < pointers_end; ++i, ++first_length) {
- typename Search::MiddlePointer ptr(search_.Unpack(*i, first_length, node));
- ret += ptr.Prob() - ptr.Rest();
- }
- return ret;
-}
-
-// Explicit instantiations: the member templates above are defined in this
-// file, so each Search/Vocabulary combination is instantiated here.
-template class GenericModel<HashedSearch<BackoffValue>, ProbingVocabulary>;
-template class GenericModel<HashedSearch<RestValue>, ProbingVocabulary>;
-template class GenericModel<trie::TrieSearch<DontQuantize, trie::DontBhiksha>, SortedVocabulary>;
-template class GenericModel<trie::TrieSearch<DontQuantize, trie::ArrayBhiksha>, SortedVocabulary>;
-template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiksha>, SortedVocabulary>;
-template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary>;
-
-} // namespace detail
-
-// Create a model of the requested type from file_name and return it through
-// the base::Model interface. model_type is passed by value to RecognizeBinary
-// first (NOTE(review): presumably it is overwritten with the type stored in a
-// binary file -- confirm), then selects the concrete class. The object is
-// allocated with `new` and nothing here releases it, so the caller is
-// responsible for deleting it. Throws FormatLoadException for an
-// unrecognised model type.
-base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) {
- RecognizeBinary(file_name, model_type);
- switch (model_type) {
- case PROBING:
- return new ProbingModel(file_name, config);
- case REST_PROBING:
- return new RestProbingModel(file_name, config);
- case TRIE:
- return new TrieModel(file_name, config);
- case QUANT_TRIE:
- return new QuantTrieModel(file_name, config);
- case ARRAY_TRIE:
- return new ArrayTrieModel(file_name, config);
- case QUANT_ARRAY_TRIE:
- return new QuantArrayTrieModel(file_name, config);
- default:
- UTIL_THROW(FormatLoadException, "Confused by model type " << model_type);
- }
-}
-
-} // namespace ngram
-} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/model.hh b/ext/kenlm/lm/model.hh
deleted file mode 100644
index b2bbe39..0000000
--- a/ext/kenlm/lm/model.hh
+++ /dev/null
@@ -1,155 +0,0 @@
-#ifndef LM_MODEL_H
-#define LM_MODEL_H
-
-#include "lm/bhiksha.hh"
-#include "lm/binary_format.hh"
-#include "lm/config.hh"
-#include "lm/facade.hh"
-#include "lm/quantize.hh"
-#include "lm/search_hashed.hh"
-#include "lm/search_trie.hh"
-#include "lm/state.hh"
-#include "lm/value.hh"
-#include "lm/vocab.hh"
-#include "lm/weights.hh"
-
-#include "util/murmur_hash.hh"
-
-#include <algorithm>
-#include <vector>
-#include <cstring>
-
-namespace util { class FilePiece; }
-
-namespace lm {
-namespace ngram {
-namespace detail {
-
-// Should return the same results as SRI.
-// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
-template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
- private:
- typedef base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> P;
- public:
- // This is the model type returned by RecognizeBinary.
- static const ModelType kModelType;
-
- static const unsigned int kVersion = Search::kVersion;
-
- /* Get the size of memory that will be mapped given ngram counts. This
- * does not include small non-mapped control structures, such as this class
- * itself.
- */
- static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
-
- /* Load the model from a file. It may be an ARPA or binary file. Binary
- * files must have the format expected by this class or you'll get an
- * exception. So TrieModel can only load ARPA or binary created by
- * TrieModel. To classify binary files, call RecognizeBinary in
- * lm/binary_format.hh.
- */
- explicit GenericModel(const char *file, const Config &config = Config());
-
- /* Score p(new_word | in_state) and incorporate new_word into out_state.
- * Note that in_state and out_state must be different references:
- * &in_state != &out_state.
- */
- FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
-
- /* Slower call without in_state. Try to remember state, but sometimes it
- * would cost too much memory or your decoder isn't setup properly.
- * To use this function, make an array of WordIndex containing the context
- * vocabulary ids in reverse order. Then, pass the bounds of the array:
- * [context_rbegin, context_rend). The new_word is not part of the context
- * array unless you intend to repeat words.
- */
- FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
-
- /* Get the state for a context. Don't use this if you can avoid it. Use
- * BeginSentenceState or NullContextState and extend from those. If
- * you're only going to use this state to call FullScore once, use
- * FullScoreForgotState.
- * To use this function, make an array of WordIndex containing the context
- * vocabulary ids in reverse order. Then, pass the bounds of the array:
- * [context_rbegin, context_rend).
- */
- void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
-
- /* More efficient version of FullScore where a partial n-gram has already
- * been scored.
- * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
- */
- FullScoreReturn ExtendLeft(
- // Additional context in reverse order. This will update add_rend to
- const WordIndex *add_rbegin, const WordIndex *add_rend,
- // Backoff weights to use.
- const float *backoff_in,
- // extend_left returned by a previous query.
- uint64_t extend_pointer,
- // Length of n-gram that the pointer corresponds to.
- unsigned char extend_length,
- // Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)]
- float *backoff_out,
- // Amount of additional content that should be considered by the next call.
- unsigned char &next_use) const;
-
- /* Return probabilities minus rest costs for an array of pointers. The
- * first length should be the length of the n-gram to which pointers_begin
- * points.
- */
- float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
- // Compiler should optimize this if away.
- return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0;
- }
-
- private:
- FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
-
- // Score bigrams and above. Do not include backoff.
- void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const;
-
- // Appears after Size in the cc file.
- void SetupMemory(void *start, const std::vector<uint64_t> &counts, const Config &config);
-
- void InitializeFromARPA(int fd, const char *file, const Config &config);
-
- float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
-
- BinaryFormat backing_;
-
- VocabularyT vocab_;
-
- Search search_;
-};
-
-} // namespace detail
-
-// Instead of typedef, inherit. This allows the Model etc to be forward declared.
-// Oh the joys of C and C++.
-#define LM_COMMA() ,
-#define LM_NAME_MODEL(name, from)\
-class name : public from {\
- public:\
- name(const char *file, const Config &config = Config()) : from(file, config) {}\
-};
-
-LM_NAME_MODEL(ProbingModel, detail::GenericModel<detail::HashedSearch<BackoffValue> LM_COMMA() ProbingVocabulary>);
-LM_NAME_MODEL(RestProbingModel, detail::GenericModel<detail::HashedSearch<RestValue> LM_COMMA() ProbingVocabulary>);
-LM_NAME_MODEL(TrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
-LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
-LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
-LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
-
-// Default implementation. No real reason for it to be the default.
-typedef ::lm::ngram::ProbingVocabulary Vocabulary;
-typedef ProbingModel Model;
-
-/* Autorecognize the file type, load, and return the virtual base class. Don't
- * use the virtual base class if you can avoid it. Instead, use the above
- * classes as template arguments to your own virtual feature function.*/
-base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING);
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_MODEL_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/model_test.cc b/ext/kenlm/lm/model_test.cc
deleted file mode 100644
index d408d6f..0000000
--- a/ext/kenlm/lm/model_test.cc
+++ /dev/null
@@ -1,448 +0,0 @@
-#include "lm/model.hh"
-
-#include <cstdlib>
-#include <cstring>
-
-#define BOOST_TEST_MODULE ModelTest
-#include <boost/test/unit_test.hpp>
-#include <boost/test/floating_point_comparison.hpp>
-
-// Apparently some Boost versions use templates and are pretty strict about types matching.
-#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
-
-namespace lm {
-namespace ngram {
-
-std::ostream &operator<<(std::ostream &o, const State &state) {
- o << "State length " << static_cast<unsigned int>(state.length) << ':';
- for (const WordIndex *i = state.words; i < state.words + state.length; ++i) {
- o << ' ' << *i;
- }
- return o;
-}
-
-namespace {
-
-// Stupid bjam reverses the command line arguments randomly.
-const char *TestLocation() {
- if (boost::unit_test::framework::master_test_suite().argc < 3) {
- return "test.arpa";
- }
- char **argv = boost::unit_test::framework::master_test_suite().argv;
- return argv[strstr(argv[1], "nounk") ? 2 : 1];
-}
-const char *TestNoUnkLocation() {
- if (boost::unit_test::framework::master_test_suite().argc < 3) {
- return "test_nounk.arpa";
- }
- char **argv = boost::unit_test::framework::master_test_suite().argv;
- return argv[strstr(argv[1], "nounk") ? 1 : 2];
-}
-
-template <class Model> State GetState(const Model &model, const char *word, const State &in) {
- WordIndex context[in.length + 1];
- context[0] = model.GetVocabulary().Index(word);
- std::copy(in.words, in.words + in.length, context + 1);
- State ret;
- model.GetState(context, context + in.length + 1, ret);
- return ret;
-}
-
-#define StartTest(word, ngram, score, indep_left) \
- ret = model.FullScore( \
- state, \
- model.GetVocabulary().Index(word), \
- out);\
- SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \
- BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \
- BOOST_CHECK_GE(std::min<unsigned char>(ngram, 5 - 1), out.length); \
- BOOST_CHECK_EQUAL(indep_left, ret.independent_left); \
- BOOST_CHECK_EQUAL(out, GetState(model, word, state));
-
-#define AppendTest(word, ngram, score, indep_left) \
- StartTest(word, ngram, score, indep_left) \
- state = out;
-
-template <class M> void Starters(const M &model) {
- FullScoreReturn ret;
- Model::State state(model.BeginSentenceState());
- Model::State out;
-
- StartTest("looking", 2, -0.4846522, true);
-
- // , probability plus <s> backoff
- StartTest(",", 1, -1.383514 + -0.4149733, true);
- // <unk> probability plus <s> backoff
- StartTest("this_is_not_found", 1, -1.995635 + -0.4149733, true);
-}
-
-template <class M> void Continuation(const M &model) {
- FullScoreReturn ret;
- Model::State state(model.BeginSentenceState());
- Model::State out;
-
- AppendTest("looking", 2, -0.484652, true);
- AppendTest("on", 3, -0.348837, true);
- AppendTest("a", 4, -0.0155266, true);
- AppendTest("little", 5, -0.00306122, true);
- State preserve = state;
- AppendTest("the", 1, -4.04005, true);
- AppendTest("biarritz", 1, -1.9889, true);
- AppendTest("not_found", 1, -2.29666, true);
- AppendTest("more", 1, -1.20632 - 20.0, true);
- AppendTest(".", 2, -0.51363, true);
- AppendTest("</s>", 3, -0.0191651, true);
- BOOST_CHECK_EQUAL(0, state.length);
-
- state = preserve;
- AppendTest("more", 5, -0.00181395, true);
- BOOST_CHECK_EQUAL(4, state.length);
- AppendTest("loin", 5, -0.0432557, true);
- BOOST_CHECK_EQUAL(1, state.length);
-}
-
-template <class M> void Blanks(const M &model) {
- FullScoreReturn ret;
- State state(model.NullContextState());
- State out;
- AppendTest("also", 1, -1.687872, false);
- AppendTest("would", 2, -2, true);
- AppendTest("consider", 3, -3, true);
- State preserve = state;
- AppendTest("higher", 4, -4, true);
- AppendTest("looking", 5, -5, true);
- BOOST_CHECK_EQUAL(1, state.length);
-
- state = preserve;
- // also would consider not_found
- AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true);
-
- state = model.NullContextState();
- // higher looking is a blank.
- AppendTest("higher", 1, -1.509559, false);
- AppendTest("looking", 2, -1.285941 - 0.30103, false);
-
- State higher_looking = state;
-
- BOOST_CHECK_EQUAL(1, state.length);
- AppendTest("not_found", 1, -1.995635 - 0.4771212, true);
-
- state = higher_looking;
- // higher looking consider
- AppendTest("consider", 1, -1.687872 - 0.4771212, true);
-
- state = model.NullContextState();
- AppendTest("would", 1, -1.687872, false);
- BOOST_CHECK_EQUAL(1, state.length);
- AppendTest("consider", 2, -1.687872 -0.30103, false);
- BOOST_CHECK_EQUAL(2, state.length);
- AppendTest("higher", 3, -1.509559 - 0.30103, false);
- BOOST_CHECK_EQUAL(3, state.length);
- AppendTest("looking", 4, -1.285941 - 0.30103, false);
-}
-
-template <class M> void Unknowns(const M &model) {
- FullScoreReturn ret;
- State state(model.NullContextState());
- State out;
-
- AppendTest("not_found", 1, -1.995635, false);
- State preserve = state;
- AppendTest("not_found2", 2, -15.0, true);
- AppendTest("not_found3", 2, -15.0 - 2.0, true);
-
- state = preserve;
- AppendTest("however", 2, -4, true);
- AppendTest("not_found3", 3, -6, true);
-}
-
-template <class M> void MinimalState(const M &model) {
- FullScoreReturn ret;
- State state(model.NullContextState());
- State out;
-
- AppendTest("baz", 1, -6.535897, true);
- BOOST_CHECK_EQUAL(0, state.length);
- state = model.NullContextState();
- AppendTest("foo", 1, -3.141592, true);
- BOOST_CHECK_EQUAL(1, state.length);
- AppendTest("bar", 2, -6.0, true);
- // Has to include the backoff weight.
- BOOST_CHECK_EQUAL(1, state.length);
- AppendTest("bar", 1, -2.718281 + 3.0, true);
- BOOST_CHECK_EQUAL(1, state.length);
-
- state = model.NullContextState();
- AppendTest("to", 1, -1.687872, false);
- AppendTest("look", 2, -0.2922095, true);
- BOOST_CHECK_EQUAL(2, state.length);
- AppendTest("a", 3, -7, true);
-}
-
-template <class M> void ExtendLeftTest(const M &model) {
- State right;
- FullScoreReturn little(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("little"), right));
- const float kLittleProb = -1.285941;
- SLOPPY_CHECK_CLOSE(kLittleProb, little.prob, 0.001);
- unsigned char next_use;
- float backoff_out[4];
-
- FullScoreReturn extend_none(model.ExtendLeft(NULL, NULL, NULL, little.extend_left, 1, NULL, next_use));
- BOOST_CHECK_EQUAL(0, next_use);
- BOOST_CHECK_EQUAL(little.extend_left, extend_none.extend_left);
- SLOPPY_CHECK_CLOSE(little.prob - little.rest, extend_none.prob, 0.001);
- BOOST_CHECK_EQUAL(1, extend_none.ngram_length);
-
- const WordIndex a = model.GetVocabulary().Index("a");
- float backoff_in = 3.14;
- // a little
- FullScoreReturn extend_a(model.ExtendLeft(&a, &a + 1, &backoff_in, little.extend_left, 1, backoff_out, next_use));
- BOOST_CHECK_EQUAL(1, next_use);
- SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001);
- SLOPPY_CHECK_CLOSE(-0.09132547 - little.rest, extend_a.prob, 0.001);
- BOOST_CHECK_EQUAL(2, extend_a.ngram_length);
- BOOST_CHECK(!extend_a.independent_left);
-
- const WordIndex on = model.GetVocabulary().Index("on");
- FullScoreReturn extend_on(model.ExtendLeft(&on, &on + 1, &backoff_in, extend_a.extend_left, 2, backoff_out, next_use));
- BOOST_CHECK_EQUAL(1, next_use);
- SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[0], 0.001);
- SLOPPY_CHECK_CLOSE(-0.0283603 - (extend_a.rest + little.rest), extend_on.prob, 0.001);
- BOOST_CHECK_EQUAL(3, extend_on.ngram_length);
- BOOST_CHECK(!extend_on.independent_left);
-
- const WordIndex both[2] = {a, on};
- float backoff_in_arr[4];
- FullScoreReturn extend_both(model.ExtendLeft(both, both + 2, backoff_in_arr, little.extend_left, 1, backoff_out, next_use));
- BOOST_CHECK_EQUAL(2, next_use);
- SLOPPY_CHECK_CLOSE(-0.69897, backoff_out[0], 0.001);
- SLOPPY_CHECK_CLOSE(-0.4771212, backoff_out[1], 0.001);
- SLOPPY_CHECK_CLOSE(-0.0283603 - little.rest, extend_both.prob, 0.001);
- BOOST_CHECK_EQUAL(3, extend_both.ngram_length);
- BOOST_CHECK(!extend_both.independent_left);
- BOOST_CHECK_EQUAL(extend_on.extend_left, extend_both.extend_left);
-}
-
-#define StatelessTest(word, provide, ngram, score) \
- ret = model.FullScoreForgotState(indices + num_words - word, indices + num_words - word + provide, indices[num_words - word - 1], state); \
- SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \
- BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length); \
- model.GetState(indices + num_words - word, indices + num_words - word + provide, before); \
- ret = model.FullScore(before, indices[num_words - word - 1], out); \
- BOOST_CHECK(state == out); \
- SLOPPY_CHECK_CLOSE(score, ret.prob, 0.001); \
- BOOST_CHECK_EQUAL(static_cast<unsigned int>(ngram), ret.ngram_length);
-
-template <class M> void Stateless(const M &model) {
- const char *words[] = {"<s>", "looking", "on", "a", "little", "the", "biarritz", "not_found", "more", ".", "</s>"};
- const size_t num_words = sizeof(words) / sizeof(const char*);
- // Silience "array subscript is above array bounds" when extracting end pointer.
- WordIndex indices[num_words + 1];
- for (unsigned int i = 0; i < num_words; ++i) {
- indices[num_words - 1 - i] = model.GetVocabulary().Index(words[i]);
- }
- FullScoreReturn ret;
- State state, out, before;
-
- ret = model.FullScoreForgotState(indices + num_words - 1, indices + num_words, indices[num_words - 2], state);
- SLOPPY_CHECK_CLOSE(-0.484652, ret.prob, 0.001);
- StatelessTest(1, 1, 2, -0.484652);
-
- // looking
- StatelessTest(1, 2, 2, -0.484652);
- // on
- AppendTest("on", 3, -0.348837, true);
- StatelessTest(2, 3, 3, -0.348837);
- StatelessTest(2, 2, 3, -0.348837);
- StatelessTest(2, 1, 2, -0.4638903);
- // a
- StatelessTest(3, 4, 4, -0.0155266);
- // little
- AppendTest("little", 5, -0.00306122, true);
- StatelessTest(4, 5, 5, -0.00306122);
- // the
- AppendTest("the", 1, -4.04005, true);
- StatelessTest(5, 5, 1, -4.04005);
- // No context of the.
- StatelessTest(5, 0, 1, -1.687872);
- // biarritz
- StatelessTest(6, 1, 1, -1.9889);
- // not found
- StatelessTest(7, 1, 1, -2.29666);
- StatelessTest(7, 0, 1, -1.995635);
-
- WordIndex unk[1];
- unk[0] = 0;
- model.GetState(unk, unk + 1, state);
- BOOST_CHECK_EQUAL(1, state.length);
- BOOST_CHECK_EQUAL(static_cast<WordIndex>(0), state.words[0]);
-}
-
-template <class M> void NoUnkCheck(const M &model) {
- WordIndex unk_index = 0;
- State state;
-
- FullScoreReturn ret = model.FullScoreForgotState(&unk_index, &unk_index + 1, unk_index, state);
- SLOPPY_CHECK_CLOSE(-100.0, ret.prob, 0.001);
-}
-
-template <class M> void Everything(const M &m) {
- Starters(m);
- Continuation(m);
- Blanks(m);
- Unknowns(m);
- MinimalState(m);
- ExtendLeftTest(m);
- Stateless(m);
-}
-
-class ExpectEnumerateVocab : public EnumerateVocab {
- public:
- ExpectEnumerateVocab() {}
-
- void Add(WordIndex index, const StringPiece &str) {
- BOOST_CHECK_EQUAL(seen.size(), index);
- seen.push_back(std::string(str.data(), str.length()));
- }
-
- void Check(const base::Vocabulary &vocab) {
- BOOST_CHECK_EQUAL(37ULL, seen.size());
- BOOST_REQUIRE(!seen.empty());
- BOOST_CHECK_EQUAL("<unk>", seen[0]);
- for (WordIndex i = 0; i < seen.size(); ++i) {
- BOOST_CHECK_EQUAL(i, vocab.Index(seen[i]));
- }
- }
-
- void Clear() {
- seen.clear();
- }
-
- std::vector<std::string> seen;
-};
-
-template <class ModelT> void LoadingTest() {
- Config config;
- config.arpa_complain = Config::NONE;
- config.messages = NULL;
- config.probing_multiplier = 2.0;
- {
- ExpectEnumerateVocab enumerate;
- config.enumerate_vocab = &enumerate;
- ModelT m(TestLocation(), config);
- enumerate.Check(m.GetVocabulary());
- BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound());
- Everything(m);
- }
- {
- ExpectEnumerateVocab enumerate;
- config.enumerate_vocab = &enumerate;
- ModelT m(TestNoUnkLocation(), config);
- enumerate.Check(m.GetVocabulary());
- BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound());
- NoUnkCheck(m);
- }
-}
-
-BOOST_AUTO_TEST_CASE(probing) {
- LoadingTest<Model>();
-}
-BOOST_AUTO_TEST_CASE(trie) {
- LoadingTest<TrieModel>();
-}
-BOOST_AUTO_TEST_CASE(quant_trie) {
- LoadingTest<QuantTrieModel>();
-}
-BOOST_AUTO_TEST_CASE(bhiksha_trie) {
- LoadingTest<ArrayTrieModel>();
-}
-BOOST_AUTO_TEST_CASE(quant_bhiksha_trie) {
- LoadingTest<QuantArrayTrieModel>();
-}
-
-template <class ModelT> void BinaryTest(Config::WriteMethod write_method) {
- Config config;
- config.write_mmap = "test.binary";
- config.messages = NULL;
- config.write_method = write_method;
- ExpectEnumerateVocab enumerate;
- config.enumerate_vocab = &enumerate;
-
- {
- ModelT copy_model(TestLocation(), config);
- enumerate.Check(copy_model.GetVocabulary());
- enumerate.Clear();
- Everything(copy_model);
- }
-
- config.write_mmap = NULL;
-
- ModelType type;
- BOOST_REQUIRE(RecognizeBinary("test.binary", type));
- BOOST_CHECK_EQUAL(ModelT::kModelType, type);
-
- {
- ModelT binary("test.binary", config);
- enumerate.Check(binary.GetVocabulary());
- Everything(binary);
- }
- unlink("test.binary");
-
- // Now test without <unk>.
- config.write_mmap = "test_nounk.binary";
- config.messages = NULL;
- enumerate.Clear();
- {
- ModelT copy_model(TestNoUnkLocation(), config);
- enumerate.Check(copy_model.GetVocabulary());
- enumerate.Clear();
- NoUnkCheck(copy_model);
- }
- config.write_mmap = NULL;
- {
- ModelT binary(TestNoUnkLocation(), config);
- enumerate.Check(binary.GetVocabulary());
- NoUnkCheck(binary);
- }
- unlink("test_nounk.binary");
-}
-
-template <class ModelT> void BinaryTest() {
- BinaryTest<ModelT>(Config::WRITE_MMAP);
- BinaryTest<ModelT>(Config::WRITE_AFTER);
-}
-
-BOOST_AUTO_TEST_CASE(write_and_read_probing) {
- BinaryTest<ProbingModel>();
-}
-BOOST_AUTO_TEST_CASE(write_and_read_rest_probing) {
- BinaryTest<RestProbingModel>();
-}
-BOOST_AUTO_TEST_CASE(write_and_read_trie) {
- BinaryTest<TrieModel>();
-}
-BOOST_AUTO_TEST_CASE(write_and_read_quant_trie) {
- BinaryTest<QuantTrieModel>();
-}
-BOOST_AUTO_TEST_CASE(write_and_read_array_trie) {
- BinaryTest<ArrayTrieModel>();
-}
-BOOST_AUTO_TEST_CASE(write_and_read_quant_array_trie) {
- BinaryTest<QuantArrayTrieModel>();
-}
-
-BOOST_AUTO_TEST_CASE(rest_max) {
- Config config;
- config.arpa_complain = Config::NONE;
- config.messages = NULL;
-
- RestProbingModel model(TestLocation(), config);
- State state, out;
- FullScoreReturn ret(model.FullScore(model.NullContextState(), model.GetVocabulary().Index("."), state));
- SLOPPY_CHECK_CLOSE(-0.2705918, ret.rest, 0.001);
- SLOPPY_CHECK_CLOSE(-0.01916512, model.FullScore(state, model.GetVocabulary().EndSentence(), out).rest, 0.001);
-}
-
-} // namespace
-} // namespace ngram
-} // namespace lm
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/model_type.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/model_type.hh b/ext/kenlm/lm/model_type.hh
deleted file mode 100644
index dcdc6ac..0000000
--- a/ext/kenlm/lm/model_type.hh
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef LM_MODEL_TYPE_H
-#define LM_MODEL_TYPE_H
-
-namespace lm {
-namespace ngram {
-
-/* Not the best numbering system, but it grew this way for historical reasons
- * and I want to preserve existing binary files. */
-typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;
-
-// Historical names.
-const ModelType HASH_PROBING = PROBING;
-const ModelType TRIE_SORTED = TRIE;
-const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;
-const ModelType ARRAY_TRIE_SORTED = ARRAY_TRIE;
-const ModelType QUANT_ARRAY_TRIE_SORTED = QUANT_ARRAY_TRIE;
-
-const static ModelType kQuantAdd = static_cast<ModelType>(QUANT_TRIE - TRIE);
-const static ModelType kArrayAdd = static_cast<ModelType>(ARRAY_TRIE - TRIE);
-
-} // namespace ngram
-} // namespace lm
-#endif // LM_MODEL_TYPE_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/neural/Jamfile
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/neural/Jamfile b/ext/kenlm/lm/neural/Jamfile
deleted file mode 100644
index 14cd8e3..0000000
--- a/ext/kenlm/lm/neural/Jamfile
+++ /dev/null
@@ -1,6 +0,0 @@
-with-eigen = [ option.get "with-eigen" ] ;
-if ! $(with-eigen) && ! [ test_flags "" : "#include <Eigen/Dense>\nint main() {}" ] {
- with-eigen = "/usr/include/eigen3" ;
-}
-with-eigen = <include>$(with-eigen) ;
-fakelib neural : ..//kenlm wordvecs.cc : $(with-eigen) : : <cxxflags>-fopenmp <linkflags>-fopenmp $(with-eigen) ;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/neural/wordvecs.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/neural/wordvecs.hh b/ext/kenlm/lm/neural/wordvecs.hh
deleted file mode 100644
index 921a2b2..0000000
--- a/ext/kenlm/lm/neural/wordvecs.hh
+++ /dev/null
@@ -1,38 +0,0 @@
-#ifndef LM_NEURAL_WORDVECS_H
-#define LM_NEURAL_WORDVECS_H
-
-#include "util/scoped.hh"
-#include "lm/vocab.hh"
-
-#include <Eigen/Dense>
-
-namespace util { class FilePiece; }
-
-namespace lm {
-namespace neural {
-
-class WordVecs {
- public:
- // Columns of the matrix are word vectors. The column index is the word.
- typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::ColMajor> Storage;
-
- /* The file should begin with a line stating the number of word vectors and
- * the length of the vectors. Then it's followed by lines containing a
- * word followed by floating-point values.
- */
- explicit WordVecs(util::FilePiece &in);
-
- const Storage &Vectors() const { return vecs_; }
-
- WordIndex Index(StringPiece str) const { return vocab_.Index(str); }
-
- private:
- util::scoped_malloc vocab_backing_;
- ngram::ProbingVocabulary vocab_;
-
- Storage vecs_;
-};
-
-}} // namespaces
-
-#endif // LM_NEURAL_WORDVECS_H