You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:33:59 UTC
[11/51] [partial] incubator-joshua git commit: Converted KenLM into a
submodule
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/pipeline.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/pipeline.cc b/ext/kenlm/lm/interpolate/pipeline.cc
deleted file mode 100644
index 47b8288..0000000
--- a/ext/kenlm/lm/interpolate/pipeline.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-#include "lm/interpolate/pipeline.hh"
-
-#include "lm/common/compare.hh"
-#include "lm/common/print.hh"
-#include "lm/common/renumber.hh"
-#include "lm/vocab.hh"
-#include "lm/interpolate/backoff_reunification.hh"
-#include "lm/interpolate/interpolate_info.hh"
-#include "lm/interpolate/merge_probabilities.hh"
-#include "lm/interpolate/merge_vocab.hh"
-#include "lm/interpolate/normalize.hh"
-#include "lm/interpolate/universal_vocab.hh"
-#include "util/stream/chain.hh"
-#include "util/stream/count_records.hh"
-#include "util/stream/io.hh"
-#include "util/stream/multi_stream.hh"
-#include "util/stream/sort.hh"
-#include "util/fixed_array.hh"
-
-namespace lm { namespace interpolate { namespace {
-
-/* Rebuild the input chains for every model and renumber their word ids.
- *
- * Both output arrays are cleared first.  For each model a Chains object is
- * created with one chain per n-gram order (leaving out the highest order when
- * exclude_highest is set), the model is attached as the source of those
- * chains, and a Renumber worker using the universal vocabulary's mapping for
- * that model is added to every chain.  Finally `positions` receives one
- * ChainPositions built from each entry of `chains`.
- */
-void SetupInputs(std::size_t buffer_size, const UniversalVocab &vocab, util::FixedArray<ModelBuffer> &models, bool exclude_highest, util::FixedArray<util::stream::Chains> &chains, util::FixedArray<util::stream::ChainPositions> &positions) {
-  chains.clear();
-  positions.clear();
-  // TODO: much better memory sizing heuristics e.g. not making the chain larger than it will use.
-  util::stream::ChainConfig chain_config(0, 2, buffer_size);
-  for (std::size_t model = 0; model < models.size(); ++model) {
-    // Number of orders read from this model (one fewer when the highest is excluded).
-    const std::size_t order_count = models[model].Order() - exclude_highest;
-    chains.push_back(order_count);
-    util::stream::Chains &model_chains = chains.back();
-    for (std::size_t level = 0; level < order_count; ++level) {
-      // Entry holds (level + 1) word ids followed by two floats.
-      // TODO do not include wasteful backoff for highest.
-      chain_config.entry_size = sizeof(WordIndex) * (level + 1) + sizeof(float) * 2;
-      model_chains.push_back(chain_config);
-    }
-    models[model].Source(model_chains);
-    for (std::size_t level = 0; level < order_count; ++level) {
-      model_chains[level] >> Renumber(vocab.Mapping(model), level + 1);
-    }
-  }
-  for (std::size_t model = 0; model < chains.size(); ++model) {
-    positions.push_back(chains[model]);
-  }
-}
-
-// Sorts every chain in `chains` in place: chain i is sorted with
-// SortOrder(i + 1).  All sorters are attached first, the chains are waited on,
-// then every sorter is merged before any of them is connected back as the
-// output of its chain.
-template <class SortOrder> void ApplySort(const util::stream::SortConfig &config, util::stream::Chains &chains) {
-  util::stream::Sorts<SortOrder> sorters(chains.size());
-  for (std::size_t n = 0; n < chains.size(); ++n) {
-    sorters.push_back(chains[n], config, SortOrder(n + 1));
-  }
-  chains.Wait(true);
-  // TODO memory management
-  const std::size_t sorter_count = sorters.size();
-  for (std::size_t n = 0; n != sorter_count; ++n) {
-    sorters[n].Merge(sorters[n].DefaultLazy());
-  }
-  for (std::size_t n = 0; n != sorter_count; ++n) {
-    sorters[n].Output(chains[n], sorters[n].DefaultLazy());
-  }
-}
-
-} // namespace
-
-// Runs the whole interpolation pipeline over `models` with the weights in
-// config.lambdas: merge the vocabularies, merge probabilities, sort in context
-// order and normalize, sort in suffix order, reunify probabilities with
-// backoffs, and finally hand the result to PrintARPA with write_file as the
-// output descriptor.  Progress messages go to std::cerr.
-// NOTE(review): max_order stays 0 when `models` is empty, so the unsigned
-// expressions `max_order - 1` below would wrap — presumably callers always
-// pass at least one model; confirm.
-void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file) {
- // Setup InterpolateInfo and UniversalVocab.
- InterpolateInfo info;
- info.lambdas = config.lambdas;
- std::vector<WordIndex> vocab_sizes;
-
- // Temporary file that receives the merged vocabulary; read back by PrintARPA at the end.
- util::scoped_fd vocab_null(util::MakeTemp(config.sort.temp_prefix));
- std::size_t max_order = 0;
- util::FixedArray<util::scoped_fd> vocab_files(models.size());
- for (ModelBuffer *i = models.begin(); i != models.end(); ++i) {
- info.orders.push_back(i->Order());
- vocab_sizes.push_back(i->Counts()[0]);
- // Duplicate the descriptor so closing vocab_files does not close the model's own vocab file.
- vocab_files.push_back(util::DupOrThrow(i->VocabFile()));
- max_order = std::max(max_order, i->Order());
- }
- UniversalVocab vocab(vocab_sizes);
- {
- // Scoped so the writer is destroyed as soon as the merge is done.
- ngram::ImmediateWriteWordsWrapper writer(NULL, vocab_null.get(), 0);
- MergeVocab(vocab_files, vocab, writer);
- }
- vocab_files.clear();
-
- std::cerr << "Merging probabilities." << std::endl;
- // Pass 1: merge probabilities
- util::FixedArray<util::stream::Chains> input_chains(models.size());
- util::FixedArray<util::stream::ChainPositions> models_by_order(models.size());
- // exclude_highest = false: every order of every model is read in this pass.
- SetupInputs(config.BufferSize(), vocab, models, false, input_chains, models_by_order);
-
- util::stream::Chains merged_probs(max_order);
- for (std::size_t i = 0; i < max_order; ++i) {
- merged_probs.push_back(util::stream::ChainConfig(PartialProbGamma::TotalSize(info, i + 1), 2, config.BufferSize())); // TODO: not buffer_size
- }
- MergeProbabilities(info, models_by_order, merged_probs);
- // counts[i] receives the number of records that pass through merged_probs[i].
- std::vector<uint64_t> counts(max_order);
- for (std::size_t i = 0; i < max_order; ++i) {
- merged_probs[i] >> util::stream::CountRecords(&counts[i]);
- }
-
- // Pass 2: normalize.
- ApplySort<ContextOrder>(config.sort, merged_probs);
- std::cerr << "Normalizing" << std::endl;
- // exclude_highest = true: the inputs are re-read without each model's highest order.
- SetupInputs(config.BufferSize(), vocab, models, true, input_chains, models_by_order);
- util::stream::Chains probabilities(max_order), backoffs(max_order - 1);
- std::size_t block_count = 2;
- for (std::size_t i = 0; i < max_order; ++i) {
- // Careful accounting to ensure RewindableStream can fit the entire vocabulary.
- block_count = std::max<std::size_t>(block_count, 2);
- // This much needs to fit in RewindableStream.
- // counts[0] is the record count of the first (order-1) merged chain.
- std::size_t fit = NGram<float>::TotalSize(i + 1) * counts[0];
- // fit / (block_count - 1) rounded up
- std::size_t min_block = (fit + block_count - 2) / (block_count - 1);
- std::size_t specify = std::max(config.BufferSize(), min_block * block_count);
- probabilities.push_back(util::stream::ChainConfig(NGram<float>::TotalSize(i + 1), block_count, specify));
- }
- // Backoff chains carry one float per record; there is one fewer of them than there are orders.
- for (std::size_t i = 0; i < max_order - 1; ++i) {
- backoffs.push_back(util::stream::ChainConfig(sizeof(float), 2, config.BufferSize()));
- }
- Normalize(info, models_by_order, merged_probs, probabilities, backoffs);
-
- // Send each backoff chain to its own temporary file buffer; the buffers are read back in pass 3.
- util::FixedArray<util::stream::FileBuffer> backoff_buffers(backoffs.size());
- for (std::size_t i = 0; i < max_order - 1; ++i) {
- backoff_buffers.push_back(util::MakeTemp(config.sort.temp_prefix));
- backoffs[i] >> backoff_buffers.back().Sink();
- }
-
- // Pass 3: backoffs in the right place.
- ApplySort<SuffixOrder>(config.sort, probabilities);
- // TODO destroy universal vocab to save RAM.
- // TODO these should be freed before merge sort happens in the above function.
- backoffs.Wait(true);
- merged_probs.Wait(true);
- std::cerr << "Reunifying backoffs" << std::endl;
-
- // For every order that has backoffs: re-source the backoff chain from its file
- // buffer, take a position on the probability chain, and create the chain that
- // will hold the combined NGram<ProbBackoff> records.
- util::stream::ChainPositions prob_pos(max_order - 1);
- util::stream::Chains combined(max_order - 1);
- for (std::size_t i = 0; i < max_order - 1; ++i) {
- backoffs[i] >> backoff_buffers[i].Source(true);
- prob_pos.push_back(probabilities[i].Add());
- combined.push_back(util::stream::ChainConfig(NGram<ProbBackoff>::TotalSize(i + 1), 2, config.BufferSize()));
- }
- util::stream::ChainPositions backoff_pos(backoffs);
-
- ReunifyBackoff(prob_pos, backoff_pos, combined);
-
- // Output positions: the combined chains for all lower orders, then the
- // last probability chain for the highest order.
- util::stream::ChainPositions output_pos(max_order);
- for (std::size_t i = 0; i < max_order - 1; ++i) {
- output_pos.push_back(combined[i].Add());
- }
- output_pos.push_back(probabilities.back().Add());
-
- probabilities >> util::stream::kRecycle;
- backoffs >> util::stream::kRecycle;
- combined >> util::stream::kRecycle;
-
- // TODO genericize to ModelBuffer etc.
- PrintARPA(vocab_null.get(), write_file, counts).Run(output_pos);
-}
-
-}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/pipeline.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/pipeline.hh b/ext/kenlm/lm/interpolate/pipeline.hh
deleted file mode 100644
index b200248..0000000
--- a/ext/kenlm/lm/interpolate/pipeline.hh
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef LM_INTERPOLATE_PIPELINE_H
-#define LM_INTERPOLATE_PIPELINE_H
-
-#include "lm/common/model_buffer.hh"
-#include "util/fixed_array.hh"
-#include "util/stream/config.hh"
-
-#include <cstddef>
-#include <string>
-
-namespace lm { namespace interpolate {
-
-// Settings consumed by Pipeline().
-struct Config {
- // Interpolation weights.
- // NOTE(review): std::vector is used here although this header does not
- // include <vector> itself — presumably it arrives through another include; confirm.
- std::vector<float> lambdas;
- // Sort configuration; BufferSize() below reads its buffer_size field.
- util::stream::SortConfig sort;
- // Convenience accessor: returns sort.buffer_size.
- std::size_t BufferSize() const { return sort.buffer_size; }
-};
-
-// Entry point of the interpolation pipeline: processes `models` according to
-// `config`.  write_file is an int handle for the output; presumably a file
-// descriptor — confirm against the definition.
-void Pipeline(util::FixedArray<ModelBuffer> &models, const Config &config, int write_file);
-
-}} // namespaces
-#endif // LM_INTERPOLATE_PIPELINE_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/split_worker.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/split_worker.cc b/ext/kenlm/lm/interpolate/split_worker.cc
deleted file mode 100644
index e777bf0..0000000
--- a/ext/kenlm/lm/interpolate/split_worker.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "lm/interpolate/split_worker.hh"
-#include "lm/common/ngram.hh"
-
-namespace lm {
-namespace interpolate {
-
-// Remembers the n-gram order and connects the two member streams to the
-// chains this worker writes to: backoff_input_ to backoff_chain and
-// sort_input_ to sort_chain.
-SplitWorker::SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
- util::stream::Chain &sort_chain)
- : order_(order) {
- backoff_chain >> backoff_input_;
- sort_chain >> sort_input_;
-}
-
-// Worker body: reads NGram<ProbBackoff> records of order_ words from
-// `position` and splits each one in two.  The word ids followed by the
-// probability go to sort_input_; the backoff alone goes to backoff_input_.
-// Both output streams are poisoned once the input is exhausted.
-void SplitWorker::Run(const util::stream::ChainPosition &position) {
-  for (util::stream::Stream input(position); input; ++input) {
-    NGram<ProbBackoff> record(input.Get(), order_);
-
-    // Sort stream entry: copy the word ids, then append the probability
-    // directly after them.
-    lm::WordIndex *words_out = reinterpret_cast<lm::WordIndex *>(sort_input_.Get());
-    const lm::WordIndex *word = record.begin();
-    while (word != record.end()) {
-      *words_out = *word;
-      ++words_out;
-      ++word;
-    }
-    *reinterpret_cast<float *>(words_out) = record.Value().prob;
-    ++sort_input_;
-
-    // Backoff stream entry: a single float.
-    *reinterpret_cast<float *>(backoff_input_.Get()) = record.Value().backoff;
-    ++backoff_input_;
-  }
-  sort_input_.Poison();
-  backoff_input_.Poison();
-}
-
-}
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/split_worker.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/split_worker.hh b/ext/kenlm/lm/interpolate/split_worker.hh
deleted file mode 100644
index 15fae68..0000000
--- a/ext/kenlm/lm/interpolate/split_worker.hh
+++ /dev/null
@@ -1,44 +0,0 @@
-#ifndef KENLM_INTERPOLATE_SPLIT_WORKER_H_
-#define KENLM_INTERPOLATE_SPLIT_WORKER_H_
-
-#include "util/stream/chain.hh"
-#include "util/stream/stream.hh"
-
-namespace lm {
-namespace interpolate {
-
-/**
- * Chain worker that splits ngram records (ids, probability, backoff) into
- * two output streams: backoffs alone, and ids plus probability.
- */
-class SplitWorker {
- public:
- /**
- * Constructs a split worker for a particular order. It writes the
- * split-off backoff values to the backoff chain and the ngram id and
- * probability to the sort chain for each ngram in the input.
- *
- * @param order number of words in each ngram record handled
- * @param backoff_chain chain that receives one float (the backoff) per ngram
- * @param sort_chain chain that receives the word ids and probability per ngram
- */
- SplitWorker(std::size_t order, util::stream::Chain &backoff_chain,
- util::stream::Chain &sort_chain);
-
- /**
- * The callback invoked to handle the input from the ngram intermediate
- * files. Reads every record at the given position, writes to both output
- * streams, and poisons them when the input ends.
- */
- void Run(const util::stream::ChainPosition& position);
-
- private:
- /**
- * The ngram order we are reading/writing for.
- */
- std::size_t order_;
-
- /**
- * The stream to write to for the backoff values (despite the name, this
- * is an output of the worker: it is the input of the backoff chain).
- */
- util::stream::Stream backoff_input_;
-
- /**
- * The stream to write to for the ngram id + probability values (likewise
- * an output of the worker and the input of the sort chain).
- */
- util::stream::Stream sort_input_;
-};
-}
-}
-#endif
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/streaming_example_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/streaming_example_main.cc b/ext/kenlm/lm/interpolate/streaming_example_main.cc
deleted file mode 100644
index 1f543cb..0000000
--- a/ext/kenlm/lm/interpolate/streaming_example_main.cc
+++ /dev/null
@@ -1,195 +0,0 @@
-#include "lm/common/compare.hh"
-#include "lm/common/model_buffer.hh"
-#include "lm/common/ngram.hh"
-#include "util/stream/chain.hh"
-#include "util/stream/multi_stream.hh"
-#include "util/stream/sort.hh"
-#include "lm/interpolate/split_worker.hh"
-
-#include <boost/program_options.hpp>
-#include <boost/version.hpp>
-
-#if defined(_WIN32) || defined(_WIN64)
-
-// Windows doesn't provide <unistd.h>
-//
-// So we define what we need here instead.  An object-like macro takes no
-// '=': "#define STDIN_FILENO = 0" would make STDIN_FILENO expand to the
-// tokens "= 0" and break every use of it.
-//
-#define STDIN_FILENO 0
-#define STDOUT_FILENO 1
-#else // Huzzah for POSIX!
-#include <unistd.h>
-#endif
-
-/*
- * This is a simple example program that takes in intermediate
- * suffix-sorted ngram files and outputs two sets of files: one for backoff
- * probability values (raw numbers, in suffix order) and one for
- * probability values (ngram id and probability, in *context* order)
- */
-int main(int argc, char *argv[]) {
-  using namespace lm::interpolate;
-
-  const std::size_t ONE_GB = 1 << 30;
-  const std::size_t SIXTY_FOUR_MB = 1 << 26;
-  const std::size_t NUMBER_OF_BLOCKS = 2;
-
-  // Defaults; each one can be overridden from the command line below.
-  std::string FILE_NAME = "ngrams";
-  std::string CONTEXT_SORTED_FILENAME = "csorted-ngrams";
-  std::string BACKOFF_FILENAME = "backoffs";
-  std::string TMP_DIR = "/tmp/";
-
-  try {
-    namespace po = boost::program_options;
-    po::options_description options("canhazinterp Pass-3 options");
-
-    options.add_options()
-      ("help,h", po::bool_switch(), "Show this help message")
-      ("ngrams,n", po::value<std::string>(&FILE_NAME), "ngrams file")
-      ("csortngrams,c", po::value<std::string>(&CONTEXT_SORTED_FILENAME), "context sorted ngrams file")
-      ("backoffs,b", po::value<std::string>(&BACKOFF_FILENAME), "backoffs file")
-      ("tmpdir,t", po::value<std::string>(&TMP_DIR), "tmp dir");
-    po::variables_map vm;
-    po::store(po::parse_command_line(argc, argv, options), vm);
-
-    // Display help
-    if (vm["help"].as<bool>()) {
-      std::cerr << "Usage: " << options << std::endl;
-      return 1;
-    }
-
-    // notify() is the step that copies the parsed values into the variables
-    // bound with po::value<T>(&var).  Without it the options are parsed but
-    // FILE_NAME, CONTEXT_SORTED_FILENAME, BACKOFF_FILENAME and TMP_DIR keep
-    // their defaults.
-    po::notify(vm);
-  }
-  catch (const std::exception &e) {
-    std::cerr << e.what() << std::endl;
-    return 1;
-  }
-
-  // The basic strategy here is to have three chains:
-  //  - The first reads the ngram order inputs using ModelBuffer. Those are
-  //    then stripped of their backoff values and fed into the third chain;
-  //    the backoff values *themselves* are written to the second chain.
-  //
-  //  - The second chain takes the backoff values and writes them out to a
-  //    file (one for each order).
-  //
-  //  - The third chain takes just the probability values and ngrams and
-  //    writes them out, sorted in context-order, to a file (one for each
-  //    order).
-
-  // This will be used to read in the binary intermediate files. There is
-  // one file per order (e.g. ngrams.1, ngrams.2, ...)
-  lm::ModelBuffer buffer(FILE_NAME);
-
-  // Create separate chains for each ngram order for:
-  //  - Input from the intermediate files
-  //  - Output to the backoff file
-  //  - Output to the (context-sorted) probability file
-  util::stream::Chains ngram_inputs(buffer.Order());
-  util::stream::Chains backoff_chains(buffer.Order());
-  util::stream::Chains prob_chains(buffer.Order());
-  for (std::size_t i = 0; i < buffer.Order(); ++i) {
-    ngram_inputs.push_back(util::stream::ChainConfig(
-        lm::NGram<lm::ProbBackoff>::TotalSize(i + 1), NUMBER_OF_BLOCKS, ONE_GB));
-
-    backoff_chains.push_back(
-        util::stream::ChainConfig(sizeof(float), NUMBER_OF_BLOCKS, ONE_GB));
-
-    prob_chains.push_back(util::stream::ChainConfig(
-        sizeof(lm::WordIndex) * (i + 1) + sizeof(float), NUMBER_OF_BLOCKS,
-        ONE_GB));
-  }
-
-  // This sets the input for each of the ngram order chains to the
-  // appropriate file
-  buffer.Source(ngram_inputs);
-
-  util::FixedArray<util::scoped_ptr<SplitWorker> > workers(buffer.Order());
-  for (std::size_t i = 0; i < buffer.Order(); ++i) {
-    // Attach a SplitWorker to each of the ngram input chains, writing to the
-    // corresponding order's backoff and probability chains
-    workers.push_back(
-        new SplitWorker(i + 1, backoff_chains[i], prob_chains[i]));
-    ngram_inputs[i] >> boost::ref(*workers.back());
-  }
-
-  util::stream::SortConfig sort_cfg;
-  sort_cfg.temp_prefix = TMP_DIR;
-  sort_cfg.buffer_size = SIXTY_FOUR_MB;
-  sort_cfg.total_memory = ONE_GB;
-
-  // This will parallel merge sort the individual order files, putting
-  // them in context-order instead of suffix-order.
-  //
-  // Two new threads will be running, each owned by the chains[i] object.
-  //  - The first executes BlockSorter.Run() to sort the n-gram entries
-  //  - The second executes WriteAndRecycle.Run() to write each sorted
-  //    block to disk as a temporary file
-  util::stream::Sorts<lm::ContextOrder> sorts(buffer.Order());
-  for (std::size_t i = 0; i < prob_chains.size(); ++i) {
-    sorts.push_back(prob_chains[i], sort_cfg, lm::ContextOrder(i + 1));
-  }
-
-  // Set the sort output to be on the same chain
-  for (std::size_t i = 0; i < prob_chains.size(); ++i) {
-    // The following call to Chain::Wait()
-    // joins the threads owned by chains[i].
-    //
-    // As such the following call won't return
-    // until all threads owned by chains[i] have completed.
-    //
-    // The following call also resets chain[i]
-    // so that it can be reused
-    // (including freeing the memory previously used by the chain)
-    prob_chains[i].Wait();
-
-    // In an ideal world (without memory restrictions)
-    // we could merge all of the previously sorted blocks
-    // by reading them all completely into memory
-    // and then running merge sort over them.
-    //
-    // In the real world, we have memory restrictions;
-    // depending on how many blocks we have,
-    // and how much memory we can use to read from each block
-    // (sort_config.buffer_size)
-    // it may be the case that we have insufficient memory
-    // to read sort_config.buffer_size of data from each block from disk.
-    //
-    // If this occurs, then it will be necessary to perform one or more rounds
-    // of merge sort on disk;
-    // doing so will reduce the number of blocks that we will eventually
-    // need to read from
-    // when performing the final round of merge sort in memory.
-    //
-    // So, the following call determines whether it is necessary
-    // to perform one or more rounds of merge sort on disk;
-    // if such on-disk merge sorting is required, such sorting is performed.
-    //
-    // Finally, the following method launches a thread that calls
-    // OwningMergingReader.Run()
-    // to perform the final round of merge sort in memory.
-    //
-    // Merge sort could have been invoked directly
-    // so that merge sort memory doesn't coexist with Chain memory.
-    sorts[i].Output(prob_chains[i]);
-  }
-
-  // Create another model buffer for our output on e.g. csorted-ngrams.1,
-  // csorted-ngrams.2, ...
-  lm::ModelBuffer output_buf(CONTEXT_SORTED_FILENAME, true, false);
-  output_buf.Sink(prob_chains, buffer.Counts());
-
-  // Create a third model buffer for our backoff output on e.g. backoff.1,
-  // backoff.2, ...
-  lm::ModelBuffer boff_buf(BACKOFF_FILENAME, true, false);
-  boff_buf.Sink(backoff_chains, buffer.Counts());
-
-  // Joins all threads that chains owns,
-  // and does a for loop over each chain object in chains,
-  // calling chain.Wait() on each such chain object
-  ngram_inputs.Wait(true);
-  backoff_chains.Wait(true);
-  prob_chains.Wait(true);
-
-  return 0;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm b/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm
deleted file mode 100644
index f4bac3d..0000000
--- a/ext/kenlm/lm/interpolate/toy_data/toy.linear_interpolation.lambda1_0.4.lambda2_0.6.lm
+++ /dev/null
@@ -1,23 +0,0 @@
-
-\data\
-ngram 1=5
-ngram 2=8
-
-\1-grams:
--0.5850267 </s>
--99 <s> -7.004176
--0.7447274 <unk>
--0.4685211 a -7.272811
--0.6575773 b -99
-
-\2-grams:
--0.2839967 <s> a
--0.3187588 <s> b
--0.79588 a </s>
--0.5058454 a a
--0.2773661 a b
--0.7447275 b </s>
--0.1135093 b a
--0.9586073 b b
-
-\end\
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm b/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm
deleted file mode 100644
index 9874ba9..0000000
--- a/ext/kenlm/lm/interpolate/toy_data/toy.loglinear_interpolation.lambda1_0.4.lambda2_0.6.lm
+++ /dev/null
@@ -1,23 +0,0 @@
-
-\data\
-ngram 1=5
-ngram 2=8
-
-\1-grams:
--0.6190513 </s>
--99 <s> -6.885247
--99 <unk>
--0.3394633 a 0
--0.5200813 b 0
-
-\2-grams:
--0.2807584 <s> a
--0.3222944 <s> b
--0.70763 a </s>
--0.6780174 a a
--0.2261675 a b
--0.7991036 b </s>
--0.1237299 b a
--1.050158 b b
-
-\end\
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy1.lm
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/toy_data/toy1.lm b/ext/kenlm/lm/interpolate/toy_data/toy1.lm
deleted file mode 100644
index fba4263..0000000
--- a/ext/kenlm/lm/interpolate/toy_data/toy1.lm
+++ /dev/null
@@ -1,23 +0,0 @@
-
-\data\
-ngram 1=5
-ngram 2=8
-
-\1-grams:
--1.3010299957 </s>
--99 <s> -7.446389
--99 <unk>
--0.6020599913 a 0
--0.6020599913 b 0
-
-\2-grams:
--0.15490196 <s> a
--0.5228787453 <s> b
--1 a </s>
--1.5228787453 a a
--0.0604807474 a b
--0.5228787453 b </s>
--0.3010299957 b a
--0.6989700043 b b
-
-\end\
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/toy_data/toy2.lm
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/toy_data/toy2.lm b/ext/kenlm/lm/interpolate/toy_data/toy2.lm
deleted file mode 100644
index 67c2ba1..0000000
--- a/ext/kenlm/lm/interpolate/toy_data/toy2.lm
+++ /dev/null
@@ -1,23 +0,0 @@
-
-\data\
-ngram 1=5
-ngram 2=8
-
-\1-grams:
--0.3979400087 </s>
--99 <s> -7.446389
--99 <unk>
--0.3979400087 a 0
--0.6989700043 b 0
-
-\2-grams:
--0.3979400087 <s> a
--0.2218487496 <s> b
--0.6989700043 a </s>
--0.3010299957 a a
--0.5228787453 a b
--1 b </s>
--0.0222763947 b a
--1.3010299957 b b
-
-\end\
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/train_params_grant_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/train_params_grant_main.cc b/ext/kenlm/lm/interpolate/train_params_grant_main.cc
deleted file mode 100644
index a106cac..0000000
--- a/ext/kenlm/lm/interpolate/train_params_grant_main.cc
+++ /dev/null
@@ -1,561 +0,0 @@
-#include "lm/ngram_query.hh"
-#include "lm/model.hh"
-#include "lm/word_index.hh"
-#include "lm/interpolate/enumerate_global_vocab.hh"
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <fstream>
-#include <map>
-#include <iomanip>
-
-#include <boost/program_options.hpp>
-#include <boost/version.hpp>
-#include <boost/foreach.hpp>
-
-#include "util/fixed_array.hh"
-
-#include <Eigen/Eigen>
-#include <Eigen/Dense>
-
-// Double-precision Eigen matrix/vector aliases used by the training code in
-// this file; the single-precision variants are left here commented out.
-// typedef Eigen::MatrixXf FMatrix;
-// typedef Eigen::VectorXf FVector;
-typedef Eigen::MatrixXd DMatrix;
-typedef Eigen::VectorXd DVector;
-
-// When true, set_features() puts a constant 1 in slot 0 of the feature vector
-// and the per-model log probabilities follow from slot 1 onwards.
-bool HAS_BIAS = true;
-
-using namespace lm::ngram;
-using namespace lm;
-
-// Scores word_idx under `model` given the context words in `ctx` and returns
-// the `prob` field of the resulting FullScoreReturn.  The words of `ctx` are
-// looked up in the model's vocabulary and handed to FullScoreForgotState in
-// reverse order (the last element of ctx comes first).
-inline double logProb(Model *model, const std::vector<std::string> &ctx,
-                      WordIndex word_idx) {
-  // Horribly inefficient
-  const Vocabulary &vocab = model->GetVocabulary();
-
-  State nextState; // throwaway
-
-  // A std::vector replaces the original variable-length array, which is a
-  // compiler extension and not standard C++.  The spare slot keeps
-  // &context_idx[0] valid when ctx is empty.
-  std::vector<WordIndex> context_idx(ctx.size() + 1);
-
-  // reverse context
-  for (std::size_t i = 0; i < ctx.size(); i++) {
-    context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]);
-  }
-
-  const WordIndex *context_begin = &context_idx[0];
-  // An empty context yields an empty range instead of indexing with
-  // ctx.size() - 1, which wraps around for size 0.
-  // NOTE(review): for a non-empty context the end pointer addresses the last
-  // filled slot, exactly as the original did.  If FullScoreForgotState treats
-  // its second argument as one-past-the-end, the oldest context word is
-  // never used — confirm whether that is intended.
-  const WordIndex *context_end =
-      ctx.empty() ? context_begin : context_begin + (ctx.size() - 1);
-
-  FullScoreReturn score = model->FullScoreForgotState(
-      context_begin, context_end, word_idx, nextState);
-
-  return score.prob;
-}
-
-// String-keyed variant: looks `word` up in the model's vocabulary and scores
-// it in context `ctx`.  A word that maps to lm::kUNK is not scored at all;
-// the caller-supplied `unkprob` is returned for it instead.
-inline double logProb(Model *model, double unkprob,
-                      const std::vector<std::string> &ctx,
-                      const std::string &word) {
-  // Horribly inefficient
-  const WordIndex word_idx = model->GetVocabulary().Index(word);
-  return (word_idx == lm::kUNK) ? unkprob : logProb(model, ctx, word_idx);
-}
-
-// Fills `v` with the feature vector of `word` in context `ctx`: one log
-// probability per model (unkprobs[i] is used for model i when the word is
-// unknown to it).  When HAS_BIAS is set, slot 0 holds the constant 1 and the
-// model scores start at slot 1; otherwise they start at slot 0.
-void set_features(const std::vector<std::string> &ctx, const std::string &word,
-                  const std::vector<Model *> &models,
-                  const std::vector<double> &unkprobs, DVector &v) {
-  const std::size_t first_model_slot = HAS_BIAS ? 1 : 0;
-  if (HAS_BIAS) {
-    v(0) = 1;
-  }
-  for (std::size_t m = 0; m < models.size(); ++m) {
-    v(m + first_model_slot) = logProb(models[m], unkprobs[m], ctx, word);
-  }
-}
-
-void train_params(const std::vector<std::vector<std::string> > &corpus,
- const std::vector<std::string> &vocab,
- const std::vector<Model *> &models) {
- using namespace std;
-
- // A safeguarded Newton's method to find optimum parameters.
- // Reverts to steepest-descent linesearch if Newton step does not improve
- // objective.
- //
- // Two Boolean variables below are used to "AllowExtrapolation" and
- // "AllowNegativeParams".
-
- bool AllowExtrapolation = true; // if true, params need not sum to one
- bool AllowNegativeParams = true; // if true, params can be negative
- const int ITERATIONS = 20;
- double minstepsize = 1.0e-9; // convergence criterion
- int context_size = 5; // (context_size+1)-grams considered in perplexity
- double stepdecreasefactor = 0.1; // if step unsuccessful
- double initstepsize = 1.0; // Initial step size
- std::size_t linesinstartercorpus = 12; // The first few lines are tuned
- // first, to find basin of attraction
- // for Newton
- // bias + #models
- const std::size_t nlambdas = models.size() + (HAS_BIAS ? 1 : 0);
-
- // initialize to sum to 1
- DVector params = DVector::Constant(nlambdas, 1.0 / nlambdas);
- DMatrix N = DMatrix::Constant(
- nlambdas, nlambdas - 1,
- -1.0 / sqrt((nlambdas - 1) * (nlambdas - 1) + nlambdas - 1.0));
- for (unsigned i = 0; i < nlambdas - 1; ++i)
- N(i, i) = N(i, i) * (1.0 - nlambdas);
- // N is nullspace matrix, each column sums to zero
-
- cerr << setprecision(16) << "++ Parameter training ++" << endl;
- if (AllowExtrapolation)
- cerr << " Allowing extrapolation (sharpening and flattening of individual "
- "LM distributions)" << endl;
- else
- cerr << " Interpolating only (not sharpening or flattening individual LM "
- "distributions)" << endl;
- if (AllowNegativeParams)
- cerr << " Allowing negative parameters\n"
- << " (more general but slow and rarely useful\n"
- << " -LM with negative weight has probability rankings reversed and "
- "is weighted more highly than all LMs with positive weights)"
- << endl;
- else
- cerr << "Not allowing negative parameters (mild assumption, and faster)"
- << endl;
- cerr << " Maximum number of iterations: " << ITERATIONS << endl;
- cerr << " Minimum step size: " << minstepsize << endl;
- cerr << " Perplexity computed with " << context_size + 1 << "-grams" << endl;
-
- if ((!AllowExtrapolation) && (nlambdas == 1)) {
- // Nothing to optimize. One parameter, and it sums to one.
- cerr << "Training complete. Best weights:" << endl;
- cerr << setprecision(16) << 1.0 << endl;
- return;
- }
-
- // Smart initialization of full tuning by tuning on smaller set first
- vector<std::size_t> linestotune;
- if (linesinstartercorpus < corpus.size())
- linestotune.push_back(linesinstartercorpus);
- linestotune.push_back(corpus.size());
-
- for (std::size_t setiter = 0; setiter < linestotune.size(); ++setiter) {
- cerr << " Now tuning the first " << linestotune[setiter] << " lines"
- << endl;
-
- vector<DVector> paramhistory;
- double bestppl = 0.0; // best recorded ppl
- DVector bestgrad = DVector::Zero(nlambdas); // corresp. gradient,
- // feasible direction
- DVector bestparams = DVector::Zero(nlambdas); // corresp. weights
- double maxbestgradstep = 0.0; // max feasible step in grad. direction
- double stepsize = initstepsize; // Initial step size
-
- for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
- cerr << "ITERATION " << iter + 1 << " (of max " << ITERATIONS
- << "), step size " << stepsize << " (of min " << minstepsize
- << "), weights: " << endl;
- cerr << params << endl;
-
- paramhistory.push_back(params);
- // Hard-coded to be 6-gram perplexity
- vector<string> context(context_size, "<s>");
- double ppl = 0.0;
- DVector grad = DVector::Zero(nlambdas);
- DMatrix H = DMatrix::Zero(nlambdas, nlambdas);
- cerr << "o";
-
- std::vector<double> unkprobs(models.size());
- // for each sentence in tuning corpus
- for (std::size_t ci = 0; ci < linestotune[setiter]; ++ci) {
- const vector<string> &sentence = corpus[ci];
- // pad our beginning context
- std::fill(context.begin(), context.end(), "<s>");
-
- // for each word in sentence
- for (std::size_t t = 0; t < sentence.size(); ++t) {
- // fill in unk probabilities for this context, to avoid having to
- // look them up redundantly later
- for (std::size_t mi = 0; mi < models.size(); ++mi) {
- unkprobs[mi] = logProb(models[mi], context, lm::kUNK);
- }
-
- DVector feats = DVector::Zero(nlambdas);
- // probs for actual n-gram
- set_features(context, sentence[t], models, unkprobs, feats);
-
- double z = 0.0;
- double maxlogprob = 0.0; // Allows us to avoid overflow with
- // negative params
- DVector expectfeats = DVector::Zero(nlambdas);
- DMatrix expectfeatmatrix = DMatrix::Zero(nlambdas, nlambdas);
- // Logically, this should be in the loop's scope
- DVector iterfeats = DVector::Zero(nlambdas);
-
- // probs over possible n-grams, for normalization
- for (std::size_t i = 0; i < vocab.size(); ++i) {
- set_features(context, vocab[i], models, unkprobs, iterfeats);
- double logprob = params.dot(iterfeats);
- if (i == 0) {
- // maxlogprob=logprob;// more precise, less underflow
- maxlogprob = 0.0; // reduces number of updates
- } else if (logprob > maxlogprob) {
- // Adjust all old values to new scaling
- double adjust = exp(maxlogprob - logprob);
- z *= adjust;
- expectfeats *= adjust;
- expectfeatmatrix *= adjust;
- maxlogprob = logprob;
- }
- double us = exp(params.dot(iterfeats) - maxlogprob); // measure
-
- z += us;
- expectfeats += us * iterfeats;
- expectfeatmatrix += us * (iterfeats * iterfeats.transpose());
- }
- expectfeats /= z; // Expectation
- expectfeatmatrix /= z; // Expectation
-
- // Add sentence[t] to the end of the context
- context[0] = sentence[t];
- std::rotate(context.begin(), context.begin() + 1, context.end());
-
- // Perplexity (actually log(perplexity))
- ppl += params.dot(feats) - log(z);
- // Gradient
- grad += feats - expectfeats;
- // Hessian
- H += -expectfeatmatrix + expectfeats * expectfeats.transpose();
- }
- cerr << ".";
- }
- ppl *= -1.0 / corpus.size();
- // The gradient and Hessian coefficients cancel out, so don't really need
- // to do this, but it's fast.
- grad *= -1.0 / corpus.size();
- H *= -1.0 / corpus.size();
- cerr << " log(PPL)=" << ppl << " PPL=" << exp(ppl) << endl;
-
- // Use results to determine next params to evaluate
- if ((ppl < bestppl) || (iter == 0)) {
- // Found a new best
- bestppl = ppl;
- bestparams = params;
- double beststepsize = stepsize;
- if (iter > 0)
- cerr << " New best point found, step size " << beststepsize << endl;
- else
- cerr << " New best point found" << endl;
-
- bestgrad = grad;
- DVector deltaparams = DVector::Zero(nlambdas);
-
- bool reverttograd = false;
-
- {
- double gradnorm = 0.0;
- double solvenorm = 0.0;
- double errnorm = 0.0;
- // Find Newton step
- if (AllowExtrapolation) {
- deltaparams = -H.colPivHouseholderQr().solve(grad);
- Eigen::SelfAdjointEigenSolver<DMatrix> eigensolver(H);
- cerr << "Eigenvalues (negative values should be negligible):\n"
- << eigensolver.eigenvalues() << endl;
- gradnorm = grad.norm();
- solvenorm = (H * deltaparams).norm();
- errnorm = (grad + H * deltaparams).norm();
- } else {
- // Project gradient to interpolation space
- bestgrad = N * N.transpose() * bestgrad;
-
- // need to work in nullspace to maintain unit sum
- DMatrix Hnull = DMatrix::Zero(nlambdas - 1, nlambdas - 1);
-
- // Looks like we don't need the three lines below -- we can do it
- // in-line (if we don't want eigenvalues)
- Hnull = N.transpose() * H * N;
- Eigen::SelfAdjointEigenSolver<DMatrix> eigensolver(Hnull);
- cerr << "Eigenvalues (best if all positive):\n"
- << eigensolver.eigenvalues() << endl;
- deltaparams =
- -N * Hnull.fullPivHouseholderQr().solve(N.transpose() * grad);
- gradnorm = (N.transpose() * grad).norm();
- solvenorm = (Hnull * deltaparams).norm();
- errnorm = (N.transpose() * grad + Hnull * deltaparams).norm();
- }
- // eventually, params = bestparams + deltaparams;
- cerr << " Error norm " << errnorm << ", gradient norm " << gradnorm
- << ", solution norm " << solvenorm << endl;
- // Check for numerical errors. Don't trust Newton step if they are too
- // big.
- if (errnorm < 1e-12 * std::max(1.0, std::min(gradnorm, solvenorm))) {
- stepsize = 0.0;
- for (std::size_t i = 0; i < nlambdas; i++)
- stepsize += deltaparams(i) * deltaparams(i);
- stepsize = sqrt(stepsize); // holds length of Newton step
- cerr << "Newton step, length " << stepsize << ": " << endl;
- cerr << deltaparams << endl;
-
- // Don't let the Newton step get much bigger than last successful
- // step (likely would have to shrink later, anyway)
- if (stepsize > 2.0 * beststepsize) {
- stepsize = 1.5 * beststepsize;
- reverttograd = true;
- cerr << "Reverting to gradient, because Newton step is too large."
- << endl;
- }
- } else {
- stepsize = 1.5 * beststepsize;
- reverttograd = true;
- cerr << "Reverting to gradient, because Newton step computation "
- "unsuccessful." << endl;
- }
- // Make the gradient unit norm, in feasible search direction.
- if (!AllowNegativeParams) {
- // Project gradient to be a feasible search direction
- vector<bool> active(nlambdas, false);
- std::size_t numactive = 0;
- for (std::size_t i = 0; i < nlambdas; i++) {
- // Project gradient to inactive constraints
- if ((bestparams(i) == 0) && (bestgrad(i) > 0)) {
- active[i] = true;
- bestgrad(i) = 0.0; // Do this now, in case extrapolation
- // allowed.
- ++numactive;
- }
- }
- if (numactive > 0) {
- if (!AllowExtrapolation) {
- // Project gradient, for activity concerns
- DMatrix tmpN = DMatrix::Constant(
- nlambdas, nlambdas - 1,
- -1.0 / sqrt((nlambdas - numactive - 1) *
- (nlambdas - numactive - 1) +
- nlambdas - numactive - 1.0));
-
- for (std::size_t i = 0; i < nlambdas - 1; ++i)
- tmpN(i, i) = tmpN(i, i) * (1.0 - (nlambdas - numactive));
-
- for (std::size_t i = 0; i < nlambdas; ++i) {
- if (active[i]) {
- for (std::size_t j = 0; j < nlambdas - 1; ++i) {
- tmpN(i, j) = 0;
- }
- }
- }
-
- // projected gradient onto unit sum and active set constraints
- bestgrad = -tmpN * tmpN.transpose() * bestgrad;
- }
- }
- }
- }
- double norm = 0.0;
- for (std::size_t i = 0; i < nlambdas; i++)
- norm += bestgrad(i) * bestgrad(i);
- if (norm != 0) {
- bestgrad /= sqrt(norm);
- } else {
- cerr << " Gradient is zero. Exiting.";
- break;
- }
- cerr << "Gradient, unit length: " << endl;
- cerr << bestgrad << endl;
-
- // Find max step in gradient direction that remains feasible.
- if (!AllowNegativeParams) {
- double limitfraction = 0.5; // Not 1: If Newton step is bad, probably
- // will need to reduce later anyway
- for (std::size_t i = 0; i < nlambdas; i++) {
- if (bestparams(i) - maxbestgradstep * bestgrad(i) < 0) {
- double tmplimitfraction =
- bestparams(i) / (bestgrad(i) * maxbestgradstep);
- if (tmplimitfraction < limitfraction)
- limitfraction = tmplimitfraction;
- }
- }
- maxbestgradstep = stepsize * limitfraction;
- cerr << " Max grad step: " << maxbestgradstep << endl;
- } else {
- maxbestgradstep = stepsize;
- }
-
- if (!reverttograd) {
- if (!AllowNegativeParams) {
- for (std::size_t i = 0; i < nlambdas; i++) {
- if (bestparams(i) + deltaparams(i) < 0) {
- // Can't do Newton step. Revert to descent.
- reverttograd = true;
- }
- }
- }
- if (reverttograd) {
- cerr << "Reverting to gradient, since Newton step infeasible:"
- << endl;
- }
- }
-
- if (reverttograd) {
- stepsize = maxbestgradstep;
- deltaparams = -bestgrad * stepsize;
- }
-
- params = bestparams + deltaparams;
- cerr << "Change in weights from best, step size " << stepsize << ": "
- << endl;
- cerr << deltaparams << endl;
- } else {
- // Last attempt failed at being better, so move in gradient direction
- // with reduced step.
- // stepsize reduction factor is empirical
- stepsize = std::min(stepdecreasefactor * stepsize, maxbestgradstep);
- cerr << "Taking smaller step: " << stepsize << endl;
- params = bestparams - bestgrad * stepsize;
- }
- // Clean the parameters up.
- double sumparams = 0.0;
- for (std::size_t i = 0; i < nlambdas; i++) {
- if (!AllowNegativeParams) {
- if (params(i) < 1e-12) {
- // snap to zero, for active set and duplicate weights
- params(i) = 0;
- }
- }
- sumparams += params(i);
- }
- if (!AllowExtrapolation) params /= sumparams;
-
- bool duplicateentry = false;
- for (std::size_t i = 0; i < paramhistory.size(); ++i) {
- if (params == paramhistory[i]) duplicateentry = true;
- }
- while ((duplicateentry) && (stepsize >= minstepsize)) {
- cerr << "Duplicate weight found: " << endl;
- cerr << params << endl;
- stepsize *= 0.5; // Step in this direction is duplicate, so try again
- // with smaller step
- params = bestparams - stepsize * bestgrad;
-
- sumparams = 0.0;
- for (std::size_t i = 0; i < nlambdas; i++) {
- if (!AllowNegativeParams) {
- if (params(i) < 1e-12) params(i) = 0;
- }
- sumparams += params(i);
- }
- if (!AllowExtrapolation) params /= sumparams;
-
- duplicateentry = false;
- for (std::size_t i = 0; i < paramhistory.size(); ++i) {
- if (params == paramhistory[i]) duplicateentry = true;
- }
- }
- if (stepsize < minstepsize) break; // No need to make another step
- }
-
- params = bestparams; // So that next setiter is correct
- cerr << "Training complete. Best weights:" << endl;
- cerr << params << endl;
- }
-}
-
-int main(int argc, char **argv) {
- std::string tuning_data;
- std::vector<std::string> lms;
-
- try {
- namespace po = boost::program_options;
- po::options_description options("train-params");
-
- options.add_options()("help,h", po::bool_switch(),
- "Show this help message")(
- "no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature")(
- "tuning_data,t", po::value<std::string>(&tuning_data),
- "File to tune perplexity on")(
- "model,m", po::value<std::vector<std::string> >(&lms),
- "Language models in KenLM format to interpolate");
- po::variables_map vm;
- po::store(po::parse_command_line(argc, argv, options), vm);
-
- // Display help
- if (argc == 1 || vm["help"].as<bool>()) {
- std::cerr << options << std::endl;
- return 1;
- }
- if (vm["no_bias_term"].as<bool>()) HAS_BIAS = false;
- lms = vm["model"].as<std::vector<std::string> >();
- tuning_data = vm["tuning_data"].as<std::string>();
- } catch (const std::exception &e) {
- std::cerr << e.what() << std::endl;
- return 1;
- }
- if (lms.size() < 2) {
- std::cerr
- << "Please specify at least two language model files with -m LM.KLM\n";
- return 1;
- }
- if (tuning_data.empty()) {
- std::cerr << "Please specify tuning set with -t FILE.TXT\n";
- return 1;
- }
-
- // Growable vocab here
- // GrowableVocab gvoc(100000); //dummy default
-
- // no comment
- std::map<std::string, int *> vmap;
-
- // stuff it into the
- EnumerateGlobalVocab *globalVocabBuilder =
- new EnumerateGlobalVocab(&vmap, lms.size());
-
- Config cfg;
- cfg.enumerate_vocab = (EnumerateVocab *)globalVocabBuilder;
-
- // load models
- std::vector<Model *> models;
- for (std::size_t i = 0; i < lms.size(); i++) {
- std::cerr << "Loading LM file: " << lms[i] << std::endl;
-
- // haaaack
- globalVocabBuilder->SetCurModel(i); // yes this is dumb
-
- Model *this_model = new Model(lms[i].c_str(), cfg);
- models.push_back(this_model);
- }
-
- // assemble vocabulary vector
- std::vector<std::string> vocab;
- std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl;
-
- for (std::map<std::string, int *>::iterator iter = vmap.begin();
- iter != vmap.end(); ++iter) {
- vocab.push_back(iter->first);
- }
- std::cerr << "Vocab vector has size: " << vocab.size() << std::endl;
-
- // load context sorted ngrams into vector of vectors
- std::vector<std::vector<std::string> > corpus;
-
- std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl;
- std::ifstream infile(tuning_data.c_str());
-
- for (std::string line; std::getline(infile, line);) {
- std::vector<std::string> words;
- std::stringstream stream(line);
- std::string word;
-
- while (stream >> word)
- words.push_back(word);
- corpus.push_back(words);
- }
-
- train_params(corpus, vocab, models);
-
- return 0;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/train_params_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/train_params_main.cc b/ext/kenlm/lm/interpolate/train_params_main.cc
deleted file mode 100644
index 39233e8..0000000
--- a/ext/kenlm/lm/interpolate/train_params_main.cc
+++ /dev/null
@@ -1,349 +0,0 @@
-#include "lm/ngram_query.hh"
-#include "lm/model.hh"
-#include "lm/word_index.hh"
-#include "lm/interpolate/enumerate_global_vocab.hh"
-
-
-#include <string>
-#include <vector>
-#include <iostream>
-#include <fstream>
-#include <map>
-
-#include <boost/program_options.hpp>
-#include <boost/version.hpp>
-#include <boost/foreach.hpp>
-
-#include "util/fixed_array.hh"
-
-#include <Eigen/Eigen>
-
-typedef Eigen::MatrixXf FMatrix;
-typedef Eigen::VectorXf FVector;
-
-bool HAS_BIAS = true;
-
-using namespace lm::ngram;
-using namespace lm;
-
-inline float logProb(Model * model, const std::vector<std::string>& ctx, const std::string& word) {
-
- // Horribly inefficient
- const Vocabulary &vocab = model->GetVocabulary();
-
- State nextState; //throwaway
-
- WordIndex word_idx = vocab.Index(word);
- WordIndex context_idx[ctx.size()];
-
- //reverse context
- for(unsigned int i = 0; i < ctx.size(); i++) {
- context_idx[ctx.size() - 1 - i] = vocab.Index(ctx[i]);
- }
-
- FullScoreReturn score = model->FullScoreForgotState(context_idx, &(context_idx[ctx.size() -1]), word_idx, nextState);
-
- float ret = score.prob;
- //std::cerr << "w: " << word << " p: " << ret << std::endl;
- return ret;
-}
-
-void set_features(const std::vector<std::string>& ctx,
- const std::string& word,
- const std::vector<Model *>& models,
- FVector& v) {
-
- //std::cerr << "setting feats for " << word << std::endl;
-
- if (HAS_BIAS) {
- v(0) = 1;
- for (unsigned i=0; i < models.size(); ++i)
- v(i + 1) = logProb(models[i], ctx, word);
- } else {
- for (unsigned i=0; i < models.size(); ++i)
- v(i) = logProb(models[i], ctx, word);
- }
-}
-
-void translate_input(
- const std::vector<std::vector<std::string> >& corpus,
- const std::vector<std::string>& gvocab,
- const std::vector<Model *>& models,
- std::vector<std::vector<std::vector<WordIndex> > >&translated_corpus,
- std::vector<std::vector<WordIndex> >&translated_vocab
- ) {
- translated_corpus.resize(models.size());
- translated_vocab.resize(models.size());
- for (unsigned mn=0; mn < models.size(); ++mn) { // models
-
- const Vocabulary &vocab = models[mn]->GetVocabulary();
-
- for (unsigned i = 0; i < gvocab.size(); ++i) {
- translated_vocab[mn].push_back(vocab.Index(gvocab[i]));
- }
-
- translated_corpus[mn].resize(corpus.size());
- for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
- const std::vector<std::string>& sentence = corpus[ci];
- for (int t = sentence.size() -1; t >= 0; --t) { // words in sentence
- translated_corpus[mn][ci].push_back(vocab.Index(sentence[t]));
- }
- for (int i=0; i<5; ++i) {
- translated_corpus[mn][ci].push_back(vocab.Index("<s>"));
- }
- }
- }
-}
-
-
-void train_params_fast(
- const std::vector<std::vector<std::string> >& corpus,
- const std::vector<std::string>& vocab,
- const std::vector<Model *>& models) {
- using namespace std;
-
- // model / sentence / words in sentence in reverse order with <s> padding
- std::vector<std::vector<std::vector<WordIndex> > > t_corpus;
- std::vector<std::vector<WordIndex> > t_vocab;
- translate_input(corpus, vocab, models, t_corpus, t_vocab);
-
-
-
- const int ITERATIONS = 10;
- const int nlambdas = models.size() + (HAS_BIAS ? 1 : 0); // bias + #models
- FVector params = FVector::Zero(nlambdas);
- vector<FVector> feats(vocab.size(), params);
- vector<float> us(vocab.size(), 0);
- vector<float> ps(vocab.size(), 0);
- FVector grad = FVector::Zero(nlambdas);
- FMatrix H = FMatrix::Zero(nlambdas, nlambdas);
- FVector ef = FVector::Zero(nlambdas);
- for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
- grad.setZero();
- H.setZero();
- double loss = 0;
- unsigned numchars = 0;
- for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
- const vector<string>& sentence = corpus[ci];
- double z = 0;
- for (int t = sentence.size() -1 ; t >=0; --t) { // words in sentence
- ++numchars;
- int ref_word = 0;
- for (unsigned i = 0; i < vocab.size(); ++i) { // vocab
- // set_features(context, vocab[i], models, feats[i]);
- for (unsigned j=0; j < models.size(); ++j) {
- // NOTE: reference ---- WordIndex word_idx = t_corpus[j][ci][t];
- WordIndex word_idx = t_vocab[j][i];
- State nextState; //throwaway
- FullScoreReturn score = models[j]->FullScoreForgotState(&(t_corpus[j][ci][t]), &(t_corpus[j][ci][t+5]), word_idx, nextState);
- feats[i](j) = score.prob;
- // feats[i](j) = logProb(models[j], ctx, word);
- }
-
- us[i] = params.dot(feats[i]);
- z += exp(double(us[i]));
- }
- //std::cerr << "there..." << std::endl;
- const float logz = log(z);
-
- // expected feature values
- ef.setZero();
- for (unsigned i = 0; i < vocab.size(); ++i) {
- ps[i] = expf(us[i] - logz);
- ef += ps[i] * feats[i];
- }
- loss -= log(ps[ref_word]);
- const FVector& reffeats = feats[ref_word];
- grad += ef - reffeats;
-
- // Hessian
- for (unsigned i = 0; i < vocab.size(); ++i)
- H.noalias() += ps[i] * feats[i] * feats[i].transpose() -
- ps[i] * feats[i] * ef.transpose();
-
- // this should just be the state for each model
- }
- cerr << ".";
- }
- cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl;
- params = H.colPivHouseholderQr().solve(grad);
- cerr << params << endl;
- }
-}
-
-
-
-
-//const util::FixedArray<Model *>& models)
-void train_params(
- const std::vector<std::vector<std::string> >& corpus,
- const std::vector<std::string>& vocab,
- const std::vector<Model *>& models) {
- using namespace std;
-
- vector<string> context(5, "<s>");
- const int ITERATIONS = 10;
- const int nlambdas = models.size() + (HAS_BIAS ? 1 : 0); // bias + #models
- FVector params = FVector::Zero(nlambdas);
- vector<FVector> feats(vocab.size(), params);
- vector<float> us(vocab.size(), 0);
- vector<float> ps(vocab.size(), 0);
- FVector grad = FVector::Zero(nlambdas);
- FMatrix H = FMatrix::Zero(nlambdas, nlambdas);
- FVector ef = FVector::Zero(nlambdas);
- for (int iter = 0; iter < ITERATIONS; ++iter) { // iterations
- grad.setZero();
- H.setZero();
- double loss = 0;
- unsigned numchars = 0;
- for (unsigned ci = 0; ci < corpus.size(); ++ci) { // sentences in tuning corpus
- const vector<string>& sentence = corpus[ci];
- std::fill(context.begin(), context.end(), "<s>");
- for (unsigned t = 0; t < sentence.size(); ++t) { // words in sentence
- ++numchars;
- const string& ref_word_string = sentence[t];
- int ref_word = 0; // TODO
- double z = 0;
- //std::cerr << "here..." << std::endl;
- for (unsigned i = 0; i < vocab.size(); ++i) { // vocab
- set_features(context, vocab[i], models, feats[i]);
- us[i] = params.dot(feats[i]);
- z += exp(double(us[i]));
- }
- //std::cerr << "there..." << std::endl;
- context.push_back(ref_word_string);
- const float logz = log(z);
-
- // expected feature values
- ef.setZero();
- for (unsigned i = 0; i < vocab.size(); ++i) {
- ps[i] = expf(us[i] - logz);
- ef += ps[i] * feats[i];
- }
- loss -= log(ps[ref_word]);
- const FVector& reffeats = feats[ref_word];
- grad += ef - reffeats;
-
- // Hessian
- for (unsigned i = 0; i < vocab.size(); ++i)
- H.noalias() += ps[i] * feats[i] * feats[i].transpose() -
- ps[i] * feats[i] * ef.transpose();
-
- // this should just be the state for each model
- }
- cerr << ".";
- }
- cerr << "ITERATION " << (iter + 1) << ": PPL=" << exp(loss / numchars) << endl;
- params = H.colPivHouseholderQr().solve(grad);
- cerr << params << endl;
- }
-}
-
-int main(int argc, char** argv) {
-
- std::string tuning_data;
- std::vector<std::string> lms;
-
- try {
- namespace po = boost::program_options;
- po::options_description options("train-params");
-
- options.add_options()
- ("help,h", po::bool_switch(), "Show this help message")
- ("no_bias_term,B", po::bool_switch(), "Do not include a 'bias' feature")
- ("tuning_data,t", po::value<std::string>(&tuning_data), "File to tune perplexity on")
- ("model,m", po::value<std::vector<std::string> >(&lms), "Language models in KenLM format to interpolate");
- po::variables_map vm;
- po::store(po::parse_command_line(argc, argv, options), vm);
-
- // Display help
- if(argc == 1 || vm["help"].as<bool>()) {
- std::cerr << options << std::endl;
- return 1;
- }
- if (vm["no_bias_term"].as<bool>())
- HAS_BIAS = false;
- lms = vm["model"].as<std::vector<std::string> >();
- tuning_data = vm["tuning_data"].as<std::string>();
- }
- catch(const std::exception &e) {
-
- std::cerr << e.what() << std::endl;
- return 1;
-
- }
- if (lms.size() < 2) {
- std::cerr << "Please specify at least two language model files with -m LM.KLM\n";
- return 1;
- }
- if (tuning_data.empty()) {
- std::cerr << "Please specify tuning set with -t FILE.TXT\n";
- return 1;
- }
-
- //Growable vocab here
- //GrowableVocab gvoc(100000); //dummy default
-
- std::map<std::string, int*> vmap;
- util::FixedArray<WordIndex> vm(2);
-
- //stuff it into the
- EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(&vmap, lms.size());
- // EnumerateGlobalVocab * globalVocabBuilder = new EnumerateGlobalVocab(vm);
-
- Config cfg;
- cfg.enumerate_vocab = (EnumerateVocab *) globalVocabBuilder;
-
- //load models
- //util::FixedArray<Model *> models(lms.size());
- std::vector<Model *> models;
- for(int i=0; i < lms.size(); i++) {
- std::cerr << "Loading LM file: " << lms[i] << std::endl;
-
- //haaaack
- globalVocabBuilder->SetCurModel(i); //yes this is dumb
-
- //models[i] = new Model(lms[i].c_str());
- Model * this_model = new Model(lms[i].c_str(), cfg);
- models.push_back( this_model );
-
- }
-
- //assemble vocabulary vector
- std::vector<std::string> vocab;
- std::cerr << "Global Vocab Map has size: " << vmap.size() << std::endl;
-
- std::pair<StringPiece,int *> me;
-
- for(std::map<std::string, int*>::iterator iter = vmap.begin(); iter != vmap.end(); ++iter) {
- vocab.push_back(iter->first);
- }
- std::cerr << "Vocab vector has size: " << vocab.size() << std::endl;
-
- //load context sorted ngrams into vector of vectors
- std::vector<std::vector<std::string> > corpus;
-
- std::cerr << "Loading context-sorted ngrams: " << tuning_data << std::endl;
- std::ifstream infile(tuning_data);
-
- for(std::string line; std::getline(infile, line); ) {
-
- std::vector<std::string> words; {
-
- std::stringstream stream(line);
- std::string word;
-
- while(stream >> word) {
- words.push_back(word);
- }
- }
- corpus.push_back(words);
- }
-
- train_params_fast(corpus, vocab, models);
-
- return 0;
-}
-
-
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_derivatives.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_derivatives.cc b/ext/kenlm/lm/interpolate/tune_derivatives.cc
deleted file mode 100644
index c20e637..0000000
--- a/ext/kenlm/lm/interpolate/tune_derivatives.cc
+++ /dev/null
@@ -1,91 +0,0 @@
-#include "lm/interpolate/tune_derivatives.hh"
-
-namespace lm { namespace interpolate {
-
-ComputeDerivative::ComputeDerivative(const util::FixedArray<Instance> &instances, const Matrix &ln_unigrams, WordIndex bos)
- : instances_(instances), ln_unigrams_(ln_unigrams), bos_(bos) {
- neg_correct_summed_ = Vector::Zero(ln_unigrams.cols());
- for (const Instance *i = instances.begin(); i != instances.end(); ++i) {
- neg_correct_summed_ -= i->ln_correct;
- }
-}
-
-Accum ComputeDerivative::Iteration(const Vector &weights, Vector &gradient, Matrix &hessian) {
- gradient = neg_correct_summed_;
- hessian = Matrix::Zero(weights.rows(), weights.rows());
-
- // TODO: loop instead to force low-memory evaluation
- // Compute p_I(x).
- Vector interp_uni((ln_unigrams_ * weights).array().exp());
- // Even -inf doesn't work for <s> because weights can be negative. Manually set it to zero.
- interp_uni(bos_) = 0.0;
- Accum Z_epsilon = interp_uni.sum();
- interp_uni /= Z_epsilon;
- // unigram_cross(i) = \sum_{all x} p_I(x) ln p_i(x)
- Vector unigram_cross(ln_unigrams_.transpose() * interp_uni);
-
- Accum sum_B_I = 0.0;
- Accum sum_ln_Z_context = 0.0;
-
- Vector weighted_extensions;
- Matrix convolve;
- Vector full_cross;
-
- for (const Instance *n = instances_.begin(); n != instances_.end(); ++n) {
- Accum ln_weighted_backoffs = n->ln_backoff.dot(weights);
- Accum weighted_backoffs = exp(ln_weighted_backoffs);
-
- // Compute \sum_{x: model does not backoff to unigram} p_I(x)
- Accum sum_x_p_I = 0.0;
- for (std::vector<WordIndex>::const_iterator x = n->extension_words.begin(); x != n->extension_words.end(); ++x) {
- sum_x_p_I += interp_uni(*x);
- }
- weighted_extensions = (n->ln_extensions * weights).array().exp();
- Accum Z_context = Z_epsilon * weighted_backoffs * (1.0 - sum_x_p_I) + weighted_extensions.sum();
- sum_ln_Z_context += log(Z_context);
-
- Accum B_I = Z_epsilon / Z_context * weighted_backoffs;
- sum_B_I += B_I;
-
- // This is the gradient term for this instance except for -log p_i(w_n | w_1^{n-1}) which was accounted for as part of neg_correct_sum_.
- // full_cross(i) is \sum_{all x} p_I(x | context) log p_i(x | context)
- full_cross =
- // Uncorrected term
- B_I * (n->ln_backoff + unigram_cross)
- // Correction term: add correct values
- + n->ln_extensions.transpose() * weighted_extensions / Z_context
- // Subtract values that should not have been charged.
- - sum_x_p_I * B_I * n->ln_backoff;
- for (std::vector<WordIndex>::const_iterator x = n->extension_words.begin(); x != n->extension_words.end(); ++x) {
- full_cross.noalias() -= interp_uni(*x) * B_I * ln_unigrams_.row(*x);
- }
-
- gradient += full_cross;
-
- convolve = unigram_cross * n->ln_backoff.transpose();
- // There's one missing term here, which is independent of context and done at the end.
- hessian.noalias() +=
- // First term of Hessian, assuming all models back off to unigram.
- B_I * (convolve + convolve.transpose() + n->ln_backoff * n->ln_backoff.transpose())
- // Second term of Hessian, with correct full probabilities.
- - full_cross * full_cross.transpose();
-
- // Adjust the first term of the Hessian to account for extension
- for (std::size_t x = 0; x < n->extension_words.size(); ++x) {
- WordIndex universal_x = n->extension_words[x];
- hessian.noalias() +=
- // Replacement terms.
- weighted_extensions(x) / Z_context * n->ln_extensions.row(x).transpose() * n->ln_extensions.row(x)
- // Presumed unigrams. TODO: individual terms with backoffs pulled out? Maybe faster?
- - interp_uni(universal_x) * B_I * (ln_unigrams_.row(universal_x).transpose() + n->ln_backoff) * (ln_unigrams_.row(universal_x) + n->ln_backoff.transpose());
- }
- }
-
- for (Matrix::Index x = 0; x < interp_uni.rows(); ++x) {
- // \sum_{contexts} B_I(context) \sum_x p_I(x) log p_i(x) log p_j(x)
- hessian.noalias() += sum_B_I * interp_uni(x) * ln_unigrams_.row(x).transpose() * ln_unigrams_.row(x);
- }
- return exp((neg_correct_summed_.dot(weights) + sum_ln_Z_context) / static_cast<double>(instances_.size()));
-}
-
-}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_derivatives.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_derivatives.hh b/ext/kenlm/lm/interpolate/tune_derivatives.hh
deleted file mode 100644
index 40c058e..0000000
--- a/ext/kenlm/lm/interpolate/tune_derivatives.hh
+++ /dev/null
@@ -1,30 +0,0 @@
-#ifndef LM_INTERPOLATE_TUNE_DERIVATIVES_H
-#define LM_INTERPOLATE_TUNE_DERIVATIVES_H
-
-#include "lm/interpolate/tune_instance.hh"
-
-#include <Eigen/Core>
-#include <cmath>
-
-namespace lm { namespace interpolate {
-
-class ComputeDerivative {
- public:
- explicit ComputeDerivative(const util::FixedArray<Instance> &instances, const Matrix &ln_unigrams, WordIndex bos);
-
- Accum Iteration(const Vector &weights, Vector &gradient, Matrix &hessian);
-
- private:
- const util::FixedArray<Instance> &instances_;
- const Matrix &ln_unigrams_;
-
- const WordIndex bos_;
-
- // neg_correct_summed_(i) = -\sum_n ln p_i(w_n | w_1^{n-1})
- Vector neg_correct_summed_;
-};
-
-}} // namespaces
-
-#endif // LM_INTERPOLATE_TUNE_DERIVATIVES_H
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_derivatives_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_derivatives_test.cc b/ext/kenlm/lm/interpolate/tune_derivatives_test.cc
deleted file mode 100644
index 75c0d12..0000000
--- a/ext/kenlm/lm/interpolate/tune_derivatives_test.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "lm/interpolate/tune_derivatives.hh"
-
-#include "lm/interpolate/tune_instance.hh"
-
-#define BOOST_TEST_MODULE DerivativeTest
-#include <boost/test/unit_test.hpp>
-
-namespace lm { namespace interpolate { namespace {
-
-BOOST_AUTO_TEST_CASE(Small) {
- // Three vocabulary words plus <s>, two models.
- Matrix unigrams(4, 2);
- unigrams <<
- 0.1, 0.6,
- 0.4, 0.3,
- 0.5, 0.1,
- // <s>
- 1.0, 1.0;
- unigrams = unigrams.array().log();
-
- // One instance
- util::FixedArray<Instance> instances(1);
- instances.push_back(2);
- Instance &instance = instances.back();
-
- instance.ln_backoff << 0.2, 0.4;
- instance.ln_backoff = instance.ln_backoff.array().log();
-
- // Sparse cases: model 0 word 2 and model 1 word 1.
-
- // Assuming that model 1 only matches word 1, this is p_1(1 | context)
- Accum model_1_word_1 = 1.0 - .6 * .4 - .1 * .4;
-
- // We'll suppose correct has WordIndex 1, which backs off in model 0, and matches in model 1
- instance.ln_correct << (0.4 * 0.2), model_1_word_1;
- instance.ln_correct = instance.ln_correct.array().log();
-
- Accum model_0_word_2 = 1.0 - .1 * .2 - .4 * .2;
-
- instance.extension_words.push_back(1);
- instance.extension_words.push_back(2);
- instance.ln_extensions.resize(2, 2);
- instance.ln_extensions <<
- (0.4 * 0.2), model_1_word_1,
- model_0_word_2, 0.1 * 0.4;
- instance.ln_extensions = instance.ln_extensions.array().log();
-
- ComputeDerivative compute(instances, unigrams, 3);
- Vector weights(2);
- weights << 0.9, 1.2;
-
- Vector gradient(2);
- Matrix hessian(2,2);
- compute.Iteration(weights, gradient, hessian);
-
- // p_I(x | context)
- Vector p_I(3);
- p_I <<
- pow(0.1 * 0.2, 0.9) * pow(0.6 * 0.4, 1.2),
- pow(0.4 * 0.2, 0.9) * pow(model_1_word_1, 1.2),
- pow(model_0_word_2, 0.9) * pow(0.1 * 0.4, 1.2);
- p_I /= p_I.sum();
-
- Vector expected_gradient = -instance.ln_correct;
- expected_gradient(0) += p_I(0) * log(0.1 * 0.2);
- expected_gradient(0) += p_I(1) * log(0.4 * 0.2);
- expected_gradient(0) += p_I(2) * log(model_0_word_2);
- BOOST_CHECK_CLOSE(expected_gradient(0), gradient(0), 0.01);
-
- expected_gradient(1) += p_I(0) * log(0.6 * 0.4);
- expected_gradient(1) += p_I(1) * log(model_1_word_1);
- expected_gradient(1) += p_I(2) * log(0.1 * 0.4);
- BOOST_CHECK_CLOSE(expected_gradient(1), gradient(1), 0.01);
-
- Matrix expected_hessian(2, 2);
- expected_hessian(1, 0) =
- // First term
- p_I(0) * log(0.1 * 0.2) * log(0.6 * 0.4) +
- p_I(1) * log(0.4 * 0.2) * log(model_1_word_1) +
- p_I(2) * log(model_0_word_2) * log(0.1 * 0.4);
- expected_hessian(1, 0) -=
- (p_I(0) * log(0.1 * 0.2) + p_I(1) * log(0.4 * 0.2) + p_I(2) * log(model_0_word_2)) *
- (p_I(0) * log(0.6 * 0.4) + p_I(1) * log(model_1_word_1) + p_I(2) * log(0.1 * 0.4));
- expected_hessian(0, 1) = expected_hessian(1, 0);
- BOOST_CHECK_CLOSE(expected_hessian(1, 0), hessian(1, 0), 0.01);
- BOOST_CHECK_CLOSE(expected_hessian(0, 1), hessian(0, 1), 0.01);
-}
-
-}}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance.cc b/ext/kenlm/lm/interpolate/tune_instance.cc
deleted file mode 100644
index f1c9924..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance.cc
+++ /dev/null
@@ -1,354 +0,0 @@
-#include "lm/interpolate/tune_instance.hh"
-
-#include "lm/common/model_buffer.hh"
-#include "lm/common/ngram_stream.hh"
-#include "lm/common/renumber.hh"
-#include "lm/enumerate_vocab.hh"
-#include "lm/interpolate/merge_vocab.hh"
-#include "lm/interpolate/universal_vocab.hh"
-#include "lm/lm_exception.hh"
-#include "util/file_piece.hh"
-#include "util/murmur_hash.hh"
-#include "util/stream/chain.hh"
-#include "util/tokenize_piece.hh"
-
-#include <boost/unordered_map.hpp>
-
-#include <cmath>
-#include <limits>
-#include <vector>
-
-namespace lm { namespace interpolate {
-
-// Extension record in the form first written to the stream, i.e. before any
-// backoff weight has been added to its log probability.
-// Packed to one-byte alignment so no padding follows the trailing byte.
-#pragma pack(push, 1)
-struct InitialExtension {
-  // The extension itself (instance, word, model, ln_prob).
-  Extension ext;
-  // N-gram order of the match this record came from.
-  uint8_t order;
-};
-#pragma pack(pop)
-
-// Collects, for a single tuning instance, the backoffs seen along its context
-// and the log probability each model assigns to the correct word.
-//
-// Intended use
-// For each model:
-//   stream through orders jointly in suffix order:
-//     Call MatchedBackoff for full matches.
-//     Call Exit when the context matches.
-//   Call FinishModel with the unigram probability of the correct word, get full
-//   probability in return.
-// Use Backoffs to adjust records that were written to the stream.
-class InstanceMatch {
-  public:
-    // models:    number of rows in the backoff table.
-    // max_order: number of columns in the backoff table.
-    // correct:   word id whose probability FinishModel reports.
-    InstanceMatch(ModelIndex models, uint8_t max_order, const WordIndex correct)
-      : seen_(std::numeric_limits<WordIndex>::max()),
-        // Eigen's zero-filled factory is Zero(rows, cols); there is no Zeros.
-        backoffs_(Matrix::Zero(models, max_order)),
-        correct_(correct), correct_from_(1), correct_ln_prob_(std::numeric_limits<float>::quiet_NaN()) {}
-
-    // Records the log backoff of a fully matched context of length `order`.
-    void MatchedBackoff(ModelIndex model, uint8_t order, float ln_backoff) {
-      backoffs_(model, order - 1) = ln_backoff;
-    }
-
-    // We only want the highest-order matches, which are the first to be exited for a given word.
-    // Repeated calls for the word most recently seen are ignored; otherwise the
-    // record is copied to `out` and the stream is advanced.
-    void Exit(const InitialExtension &from, util::stream::Stream &out) {
-      if (from.ext.word == seen_) return;
-      seen_ = from.ext.word;
-      *static_cast<InitialExtension*>(out.Get()) = from;
-      ++out;
-      if (UTIL_UNLIKELY(correct_ == from.ext.word)) {
-        // Remember where the correct word matched so FinishModel can charge
-        // the right backoff penalty.
-        correct_from_ = from.order;
-        correct_ln_prob_ = from.ext.ln_prob;
-      }
-    }
-
-    WordIndex Correct() const { return correct_; }
-
-    // Call this after each model has been passed through. Converts that
-    // model's row of backoffs into cumulative penalties, resets the per-model
-    // state and returns the log probability of the correct word including the
-    // backoff penalty. correct_ln_unigram is used when the correct word was
-    // never seen above unigram order.
-    float FinishModel(ModelIndex model, float correct_ln_unigram) {
-      seen_ = std::numeric_limits<WordIndex>::max();
-      // Turn backoffs into multiplied values (added in log space).
-      // So backoffs_(model, order - 1) is the penalty for matching order.
-      float accum = 0.0;
-      for (int order = backoffs_.cols() - 1; order >= 0; --order) {
-        accum += backoffs_(model, order);
-        backoffs_(model, order) = accum;
-      }
-      if (correct_from_ == 1) {
-        correct_ln_prob_ = correct_ln_unigram;
-      }
-      // A match beyond the last column has no backoff left to charge.
-      if (correct_from_ - 1 < backoffs_.cols()) {
-        correct_ln_prob_ += backoffs_(model, correct_from_ - 1);
-      }
-      correct_from_ = 1;
-      return correct_ln_prob_;
-    }
-
-    // The (model, order - 1) table; rows hold cumulative penalties once
-    // FinishModel has been called for that model.
-    const Matrix &Backoffs() const {
-      return backoffs_;
-    }
-
-  private:
-    // What's the last word we've seen? Used to act only on exiting the longest match.
-    WordIndex seen_;
-
-    Matrix backoffs_;
-
-    const WordIndex correct_;
-
-    // These only apply to the most recent model.
-    uint8_t correct_from_;
-
-    float correct_ln_prob_;
-};
-
-namespace {
-
-// Forward information to multiple instances of a context.
-// Holds non-owning pointers to every InstanceMatch registered for one context
-// and relays each callback to all of them in registration order.
-// NOTE(review): MiddleLoader and HighestLoader call MatchedContext on this
-// class, but no such member is defined here -- confirm what it should forward to.
-class DispatchContext {
- public:
- // Adds an instance to the fan-out list. Only its address is stored, so the
- // referenced object has to outlive this dispatcher.
- void Register(InstanceMatch &context) {
- registered_.push_back(&context);
- }
-
- // Relays a matched backoff to every registered instance.
- // NOTE(review): InstanceMatch::MatchedBackoff is declared with three
- // parameters (model, order, ln_backoff) but only two are passed here, so a
- // model index presumably still has to be threaded through -- confirm.
- void MatchedBackoff(uint8_t order, float ln_backoff) {
- for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i)
- (*i)->MatchedBackoff(order, ln_backoff);
- }
-
- // Relays an exit event to every registered instance; all of them are handed
- // the same output stream.
- void Exit(const InitialExtension &from, util::stream::Stream &out) {
- for (std::vector<InstanceMatch*>::iterator i = registered_.begin(); i != registered_.end(); ++i) {
- (*i)->Exit(from, out);
- }
- }
-
- private:
- // Registered instances, not owned.
- std::vector<InstanceMatch*> registered_;
-};
-
-// Map from n-gram hash to contexts in the tuning data.
-typedef boost::unordered_map<uint64_t, DispatchContext> ContextMap;
-
-// Stream worker that adds the backoff penalty to the log probability of every
-// extension record passing through a chain.
-class ApplyBackoffs {
-  public:
-    // backoffs: array of matches indexed by Extension::instance. Not owned.
-    // Presumably FinishModel has already turned each row into cumulative
-    // penalties by the time Run executes -- TODO confirm with the caller.
-    explicit ApplyBackoffs(const InstanceMatch *backoffs) : backoffs_(backoffs) {}
-
-    // Rewrites each record in place.
-    void Run(const util::stream::ChainPosition &position) {
-      for (util::stream::Stream stream(position); stream; ++stream) {
-        InitialExtension &ini = *static_cast<InitialExtension*>(stream.Get());
-        // An InstanceMatch cannot be added to a float; the penalty for a match
-        // at `order` lives at Backoffs()(model, order - 1).
-        const Matrix &penalties = backoffs_[ini.ext.instance].Backoffs();
-        ini.ext.ln_prob += penalties(ini.ext.model, ini.order - 1);
-      }
-    }
-
-  private:
-    const InstanceMatch *backoffs_;
-};
-
-// Attaches the extension records to the chain `on`.
-// While the sorter from loading still exists, its output feeds the chain and
-// is also sent to a temporary file; once the sorter has been released, the
-// records are read back from that file instead.
-void Instances::ReadExtensions(util::stream::Chain &on) {
-  if (extensions_first_.get()) {
-    // Lazy sort and save a sorted copy to disk. TODO: cut down on record size by stripping out order information?
-    extensions_first_->Output(on);
-    // Release the scoped_ptr itself (operator-> would forward reset() to the
-    // Sort object), so that later calls take the else branch.
-    extensions_first_.reset();
-    // TODO: apply backoff data!!!!
-
-    extensions_subsequent_.reset(new util::stream::FileBuffer(util::MakeTemp(sorting_config_.temp_prefix)));
-    on >> extensions_subsequent_->Sink();
-  } else {
-    on >> extensions_subsequent_->Source();
-  }
-}
-
-// Stream worker for the unigram order of one model. Fills that model's column
-// of the unigram matrix and reports unigram backoffs for contexts in the map.
-class UnigramLoader {
-  public:
-    // contexts_for_backoffs: contexts to notify, keyed by n-gram hash.
-    // ln_probs:              matrix whose column model_number is written.
-    // model_number:          column index, also kept in model_.
-    UnigramLoader(ContextMap &contexts_for_backoffs, Matrix &ln_probs, std::size_t model_number)
-      : map_(contexts_for_backoffs),
-        prob_(ln_probs.col(model_number)),
-        // Previously left uninitialized.
-        model_(model_number) {}
-
-    // Values are scaled by M_LN10 (presumably converting log10 to natural log).
-    // Assumes records arrive in increasing word id order -- TODO confirm.
-    void Run(const util::stream::ChainPosition &position) {
-      // TODO handle the case of a unigram model?
-      NGramStream<ProbBackoff> input(position);
-      assert(input);
-      // The first record's probability is what words missing from this model
-      // receive (presumably it is <unk> -- verify).
-      Accum unk = input->Value().prob * M_LN10;
-      WordIndex previous = 0;
-      for (; input; ++input) {
-        WordIndex word = *input->begin();
-        // Fill the gap of ids skipped since the previous record.
-        prob_.segment(previous, word - previous) = Vector::Constant(word - previous, unk);
-        prob_(word) = input->Value().prob * M_LN10;
-        ContextMap::iterator i = map_.find(util::MurmurHashNative(input->begin(), sizeof(WordIndex)));
-        if (i != map_.end()) {
-          i->second.MatchedBackoff(1, input->Value().backoff * M_LN10);
-        }
-        previous = word + 1;
-      }
-      // Fill whatever remains after the last record.
-      prob_.segment(previous, prob_.rows() - previous) = Vector::Constant(prob_.rows() - previous, unk);
-    }
-
-  private:
-    ContextMap &map_;
-    Matrix::ColXpr prob_;
-    std::size_t model_;
-};
-
-// Stream worker for orders that carry both a probability and a backoff. Every
-// record is looked up twice in the context map: once as a whole n-gram and
-// once without its final word.
-class MiddleLoader {
-  public:
-    explicit MiddleLoader(ContextMap &map)
-      : map_(map) {}
-
-    void Run(const util::stream::ChainPosition &position) {
-      NGramStream<ProbBackoff> input(position);
-      // Byte length of the words in one record, and of all but the last word.
-      const std::size_t full_size =
-          reinterpret_cast<const uint8_t*>(input->end()) - reinterpret_cast<const uint8_t*>(input->begin());
-      const std::size_t context_size = full_size - sizeof(WordIndex);
-      while (input) {
-        // The whole n-gram is itself a context: report its backoff.
-        ContextMap::iterator whole = map_.find(util::MurmurHashNative(input->begin(), full_size));
-        if (whole != map_.end()) {
-          whole->second.MatchedBackoff(input->Order(), input->Value().backoff * M_LN10);
-        }
-        // The n-gram extends a context by its last word: report the probability.
-        ContextMap::iterator prefix = map_.find(util::MurmurHashNative(input->begin(), context_size));
-        if (prefix != map_.end()) {
-          prefix->second.MatchedContext(input->Order(), *(input->end() - 1), input->Value().prob * M_LN10);
-        }
-        ++input;
-      }
-    }
-
-  private:
-    ContextMap &map_;
-};
-
-// Stream worker for the top order, whose records hold a bare float and no
-// backoff. Only the lookup of the n-gram minus its final word is performed.
-class HighestLoader {
-  public:
-    HighestLoader(ContextMap &map, uint8_t order)
-      : map_(map), order_(order) {}
-
-    void Run(const util::stream::ChainPosition &position) {
-      // Byte length of every word except the last one.
-      const std::size_t context_size = sizeof(WordIndex) * (order_ - 1);
-      ProxyStream<NGram<float> > input(position, NGram<float>(NULL, order_));
-      while (input) {
-        ContextMap::iterator found = map_.find(util::MurmurHashNative(input->begin(), context_size));
-        if (found != map_.end()) {
-          found->second.MatchedContext(order_, *(input->end() - 1), input->Value() * M_LN10);
-        }
-        ++input;
-      }
-    }
-
-  private:
-    ContextMap &map_;
-    const uint8_t order_;
-};
-
-// Vocabulary callback that turns the tuning text into word ids. Construction
-// reads the file and remembers, per word hash, every position where that word
-// occurs; Add later writes the id into all of those positions.
-class IdentifyTuning : public EnumerateVocab {
-  public:
-    // tuning_file: descriptor of the tuning text.
-    // out:         cleared, then sized to one slot per token plus one </s> per
-    //              line, plus a final slot reserved for <s>.
-    IdentifyTuning(int tuning_file, std::vector<WordIndex> &out) : indices_(out) {
-      indices_.clear();
-      std::size_t position = 0;
-      std::vector<std::size_t> &sentence_ends = words_[util::MurmurHashNative("</s>", 4)];
-      {
-        util::FilePiece in(tuning_file);
-        StringPiece text;
-        while (in.ReadLineOrEOF(text)) {
-          for (util::TokenIter<util::BoolCharacter, true> token(text, util::kSpaces); token; ++token) {
-            UTIL_THROW_IF(*token == "<s>" || *token == "</s>", FormatLoadException, "Illegal word in tuning data: " << *token);
-            const std::size_t here = position++;
-            words_[util::MurmurHashNative(token->data(), token->size())].push_back(here);
-          }
-          // Every line is closed with an implicit </s>.
-          sentence_ends.push_back(position++);
-        }
-      }
-      // Also get <s>
-      indices_.resize(position + 1);
-      words_[util::MurmurHashNative("<s>", 3)].push_back(indices_.size() - 1);
-    }
-
-    // Stores id at every position recorded for str; other words are ignored.
-    void Add(WordIndex id, const StringPiece &str) {
-      typedef boost::unordered_map<uint64_t, std::vector<std::size_t> > Positions;
-      Positions::iterator hit = words_.find(util::MurmurHashNative(str.data(), str.size()));
-      if (hit == words_.end()) return;
-      std::vector<std::size_t> &where = hit->second;
-      for (std::vector<std::size_t>::iterator at = where.begin(); at != where.end(); ++at) {
-        indices_[*at] = id;
-      }
-    }
-
-    // Removes the slot reserved for <s> and returns the id found there.
-    WordIndex FinishGetBOS() {
-      const WordIndex bos = indices_.back();
-      indices_.pop_back();
-      return bos;
-    }
-
-  private:
-    // Caller's vector being filled in.
-    std::vector<WordIndex> &indices_;
-
-    // Word hash -> positions in indices_ where that word occurs.
-    boost::unordered_map<uint64_t, std::vector<std::size_t> > words_;
-};
-
-} // namespace
-
-// Sizes the per-model vectors to num_models entries and starts the extension
-// matrix with zero rows and num_models columns.
-Instance::Instance(std::size_t num_models)
-  : ln_backoff(num_models),
-    ln_correct(num_models),
-    ln_extensions(0, num_models) {}
-
-// Reads the tuning text, merges the models' vocabularies, registers every
-// context of every tuning word, then streams each order of each model through
-// the matching loader. ln_unigrams is resized to (combined vocabulary size x
-// number of models). Returns the word id of <s>.
-// NOTE(review): InstanceBuilder is not defined in the code visible here; the
-// class above with a similar role is InstanceMatch, whose constructor takes
-// (models, max_order, correct) -- confirm which type and arguments are meant.
-WordIndex LoadInstances(int tuning_file, const std::vector<StringPiece> &model_names, util::FixedArray<Instance> &instances, Matrix &ln_unigrams) {
- util::FixedArray<ModelBuffer> models(model_names.size());
- std::vector<WordIndex> vocab_sizes;
- vocab_sizes.reserve(model_names.size());
- util::FixedArray<util::scoped_fd> vocab_files(model_names.size());
- std::size_t max_order = 0;
- // Open every model, recording its unigram count, vocabulary file and order.
- for (std::vector<StringPiece>::const_iterator i = model_names.begin(); i != model_names.end(); ++i) {
- models.push_back(*i);
- vocab_sizes.push_back(models.back().Counts()[0]);
- vocab_files.push_back(models.back().StealVocabFile());
- max_order = std::max(max_order, models.back().Order());
- }
- UniversalVocab vocab(vocab_sizes);
- std::vector<WordIndex> tuning_words;
- WordIndex bos;
- WordIndex combined_vocab_size;
- {
- // identify only lives for the merge; it fills tuning_words as words are enumerated.
- IdentifyTuning identify(tuning_file, tuning_words);
- combined_vocab_size = MergeVocab(vocab_files, vocab, identify);
- bos = identify.FinishGetBOS();
- }
-
- instances.Init(tuning_words.size());
- util::FixedArray<InstanceBuilder> builders(tuning_words.size());
- std::vector<WordIndex> context;
- context.push_back(bos);
-
- // Populate the map from contexts to instance builders.
- ContextMap cmap;
- // NOTE(review): back() is undefined on an empty vector; presumably the
- // tuning file always yields at least one word -- confirm or guard.
- const WordIndex eos = tuning_words.back();
- for (std::size_t i = 0; i < tuning_words.size(); ++i) {
- instances.push_back(model_names.size());
- builders.push_back(tuning_words[i], max_order);
- // Register the builder under the hash of every suffix of the current context.
- for (std::size_t j = 0; j < context.size(); ++j) {
- cmap[util::MurmurHashNative(&context[j], sizeof(WordIndex) * (context.size() - j))].Register(builders.back());
- }
- // Prepare for next word.
- if (tuning_words[i] == eos) {
- // Sentence boundary: the next context starts over from <s>.
- context.clear();
- context.push_back(bos);
- } else {
- // Drop the oldest word once the context holds max_order words.
- if (context.size() == max_order) {
- context.erase(context.begin());
- }
- context.push_back(tuning_words[i]);
- }
- }
-
- ln_unigrams.resize(combined_vocab_size, models.size());
-
- // Scan through input files. Sadly not parallel due to an underlying hash table.
- for (std::size_t m = 0; m < models.size(); ++m) {
- for (std::size_t order = 1; order <= models[m].Order(); ++order) {
- util::stream::Chain chain(util::stream::ChainConfig(sizeof(ProbBackoff) + order * sizeof(WordIndex), 2, 64 * 1048576));
- models[m].Source(order - 1, chain);
- chain >> Renumber(vocab.Mapping(m), order);
- // Order 1 gets the unigram loader, the model's top order the highest
- // loader, everything in between the middle loader.
- if (order == 1) {
- chain >> UnigramLoader(cmap, ln_unigrams, m);
- } else if (order < models[m].Order()) {
- chain >> MiddleLoader(cmap);
- } else {
- chain >> HighestLoader(cmap, order);
- }
- }
- for (std::size_t instance = 0; instance < tuning_words.size(); ++instance) {
- builders[instance].Dump(m, ln_unigrams, instances[instance]);
- }
- ln_unigrams(bos, m) = -99; // Does not matter as long as it does not produce nans since tune_derivatives sets this to zero.
- }
- return bos;
-}
-
-}} // namespaces
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance.hh b/ext/kenlm/lm/interpolate/tune_instance.hh
deleted file mode 100644
index c11eec9..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance.hh
+++ /dev/null
@@ -1,86 +0,0 @@
-#ifndef LM_INTERPOLATE_TUNE_INSTANCE_H
-#define LM_INTERPOLATE_TUNE_INSTANCE_H
-
-#include "lm/interpolate/tune_matrix.hh"
-#include "lm/word_index.hh"
-#include "util/scoped.hh"
-#include "util/stream/config.hh"
-#include "util/string_piece.hh"
-
-#include <boost/optional.hpp>
-
-#include <vector>
-
-namespace util { namespace stream {
-template <class S, class T> class Sort;
-class Chain;
-class FileBuffer;
-}} // namespaces
-
-namespace lm { namespace interpolate {
-
-typedef uint32_t InstanceIndex;
-typedef uint32_t ModelIndex;
-
-// One (instance, word, model) observation together with its log probability.
-struct Extension {
-  // Which tuning instance does this belong to?
-  InstanceIndex instance;
-  WordIndex word;
-  ModelIndex model;
-  // ln p_{model} (word | context(instance))
-  float ln_prob;
-
-  // Orders by instance, then word, then model. ln_prob takes no part in the
-  // comparison.
-  bool operator<(const Extension &other) const {
-    if (instance < other.instance) return true;
-    if (other.instance < instance) return false;
-    if (word < other.word) return true;
-    if (other.word < word) return false;
-    return model < other.model;
-  }
-};
-
-// Tuning data loaded from a tuning file and a set of models: per-instance
-// backoffs, the correct-word term of the gradient, the unigram table, and the
-// extension records that can be streamed onto a chain.
-class Instances {
-  public:
-    Instances(int tune_file, const std::vector<StringPiece> &model_names);
-
-    // Row of ln_backoffs_ belonging to one instance.
-    // NOTE(review): ConstRowXpr is normally a member typedef of an Eigen
-    // matrix type rather than a name in namespace Eigen -- confirm this
-    // return type resolves.
-    Eigen::ConstRowXpr Backoffs(InstanceIndex instance) const {
-      return ln_backoffs_.row(instance);
-    }
-
-    const Vector &CorrectGradientTerm() const { return neg_ln_correct_sum_; }
-
-    const Matrix &LNUnigrams() const { return ln_unigrams_; }
-
-    // Attaches the extension records to the given chain.
-    void ReadExtensions(util::stream::Chain &to);
-
-  private:
-    // ln_backoffs_(instance, model) is the backoff all the way to unigrams.
-    typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> BackoffMatrix;
-    BackoffMatrix ln_backoffs_;
-
-    // neg_ln_correct_sum_(model) = -\sum_{instances} ln p_{model}(correct(instance) | context(instance)).
-    // This appears as a term in the gradient.
-    Vector neg_ln_correct_sum_;
-
-    // ln_unigrams_(word, model) = ln p_{model}(word).
-    Matrix ln_unigrams_;
-
-    // Orders raw records by Extension::operator<.
-    struct ExtensionCompare {
-      bool operator()(const void *f, const void *s) const {
-        // Dereference the records. Casting the pointer variables themselves
-        // to a reference would compare the pointers' own bytes instead.
-        return *static_cast<const Extension*>(f) < *static_cast<const Extension*>(s);
-      }
-    };
-
-    // This is the source of data for the first iteration.
-    util::scoped_ptr<util::stream::Sort<ExtensionCompare> > extensions_first_;
-
-    // Source of data for subsequent iterations. This contains already-sorted data.
-    util::scoped_ptr<util::stream::FileBuffer> extensions_subsequent_;
-
-    const util::stream::SortConfig sorting_config_;
-};
-
-}} // namespaces
-#endif // LM_INTERPOLATE_TUNE_INSTANCE_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh b/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh
deleted file mode 100755
index d725572..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance_data/generate.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-../../../bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy0 --arpa toy0.arpa <<EOF
-a a b a
-b a a b
-EOF
-../../../bin/lmplz --discount_fallback -o 3 -S 100M --intermediate toy1 --arpa toy1.arpa <<EOF
-a a b b b b b b b
-c
-EOF
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1 b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1
deleted file mode 100644
index 1b66c51..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.1 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2 b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2
deleted file mode 100644
index d735b1c..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.2 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3 b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3
deleted file mode 100644
index 2d97aa3..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.3 and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate
deleted file mode 100644
index 8513475..0000000
--- a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.kenlm_intermediate
+++ /dev/null
@@ -1,3 +0,0 @@
-KenLM intermediate binary file
-Counts 5 7 7
-Payload pb
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab b/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab
deleted file mode 100644
index 520c0f9..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy0.vocab and /dev/null differ
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1 b/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1
deleted file mode 100644
index a50cec6..0000000
Binary files a/ext/kenlm/lm/interpolate/tune_instance_data/toy1.1 and /dev/null differ