You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:33:53 UTC
[05/51] [partial] incubator-joshua git commit: Converted KenLM into a
submodule
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/python/kenlm.pyx
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/python/kenlm.pyx b/ext/kenlm/python/kenlm.pyx
deleted file mode 100644
index f312c90..0000000
--- a/ext/kenlm/python/kenlm.pyx
+++ /dev/null
@@ -1,231 +0,0 @@
-import os
-cimport _kenlm
-
-cdef bytes as_str(data):
- if isinstance(data, bytes):
- return data
- elif isinstance(data, unicode):
- return data.encode('utf8')
- raise TypeError('Cannot convert %s to string' % type(data))
-
-cdef class FullScoreReturn:
- """
- Wrapper around FullScoreReturn.
-
- Notes:
- `prob` has been renamed to `log_prob`
- `oov` has been added to flag whether the word is OOV
- """
-
- cdef float log_prob
- cdef int ngram_length
- cdef bint oov
-
- def __cinit__(self, log_prob, ngram_length, oov):
- self.log_prob = log_prob
- self.ngram_length = ngram_length
- self.oov = oov
-
- def __repr__(self):
- return '{0}({1}, {2}, {3})'.format(self.__class__.__name__, repr(self.log_prob), repr(self.ngram_length), repr(self.oov))
-
- property log_prob:
- def __get__(self):
- return self.log_prob
-
- property ngram_length:
- def __get__(self):
- return self.ngram_length
-
- property oov:
- def __get__(self):
- return self.oov
-
-cdef class State:
- """
- Wrapper around lm::ngram::State so that python code can make incremental queries.
-
- Notes:
- * rich comparisons
- * hashable
- """
-
- cdef _kenlm.State _c_state
-
- def __richcmp__(State qa, State qb, int op):
- r = qa._c_state.Compare(qb._c_state)
- if op == 0: # <
- return r < 0
- elif op == 1: # <=
- return r <= 0
- elif op == 2: # ==
- return r == 0
- elif op == 3: # !=
- return r != 0
- elif op == 4: # >
- return r > 0
- else: # >=
- return r >= 0
-
- def __hash__(self):
- return _kenlm.hash_value(self._c_state)
-
-
-cdef class LanguageModel:
- """
- This is not a strict wrapper, the interface is more pythonic.
- It loads models and queries full sentences.
- """
-
- cdef _kenlm.Model* model
- cdef public bytes path
- cdef _kenlm.const_Vocabulary* vocab
-
- def __init__(self, path):
- """
- Load the language model.
-
- :param path: path to an arpa file or a kenlm binary file.
- """
- self.path = os.path.abspath(as_str(path))
- try:
- self.model = _kenlm.LoadVirtual(self.path)
- except RuntimeError as exception:
- exception_message = str(exception).replace('\n', ' ')
- raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
- from exception
- self.vocab = &self.model.BaseVocabulary()
-
- def __dealloc__(self):
- del self.model
-
- property order:
- def __get__(self):
- return self.model.Order()
-
- def score(self, sentence, bos = True, eos = True):
- cdef list words = as_str(sentence).split()
- cdef _kenlm.State state
- if bos:
- self.model.BeginSentenceWrite(&state)
- else:
- self.model.NullContextWrite(&state)
- cdef _kenlm.State out_state
- cdef float total = 0
- for word in words:
- total += self.model.BaseScore(&state, self.vocab.Index(word), &out_state)
- state = out_state
- if eos:
- total += self.model.BaseScore(&state, self.vocab.EndSentence(), &out_state)
- return total
-
- def full_scores(self, sentence, bos = True, eos = True):
- """
- full_scores(sentence, bos = True, eos = Ture) -> generate full scores (prob, ngram length, oov)
- @param sentence is a string (do not use boundary symbols)
- @param bos should kenlm add a bos state
- @param eos should kenlm add an eos state
- """
- cdef list words = as_str(sentence).split()
- cdef _kenlm.State state
- if bos:
- self.model.BeginSentenceWrite(&state)
- else:
- self.model.NullContextWrite(&state)
- cdef _kenlm.State out_state
- cdef _kenlm.FullScoreReturn ret
- cdef float total = 0
- cdef _kenlm.WordIndex wid
- for word in words:
- wid = self.vocab.Index(word)
- ret = self.model.BaseFullScore(&state, wid, &out_state)
- yield (ret.prob, ret.ngram_length, wid == 0)
- state = out_state
- if eos:
- ret = self.model.BaseFullScore(&state,
- self.vocab.EndSentence(), &out_state)
- yield (ret.prob, ret.ngram_length, False)
-
- def __contains__(self, word):
- cdef bytes w = as_str(word)
- return (self.vocab.Index(w) != 0)
-
- def __repr__(self):
- return '<LanguageModel from {0}>'.format(os.path.basename(self.path))
-
- def __reduce__(self):
- return (_kenlm.LanguageModel, (self.path,))
-
-cdef class Model:
- """
- This is closer to a wrapper around lm::ngram::Model.
- """
-
- cdef _kenlm.Model* model
- cdef public bytes path
- cdef _kenlm.const_Vocabulary* vocab
-
- def __init__(self, path):
- """
- Load the language model.
-
- :param path: path to an arpa file or a kenlm binary file.
- """
- self.path = os.path.abspath(as_str(path))
- try:
- self.model = _kenlm.LoadVirtual(self.path)
- except RuntimeError as exception:
- exception_message = str(exception).replace('\n', ' ')
- raise IOError('Cannot read model \'{}\' ({})'.format(path, exception_message))\
- from exception
- self.vocab = &self.model.BaseVocabulary()
-
- def __dealloc__(self):
- del self.model
-
- property order:
- def __get__(self):
- return self.model.Order()
-
- def BeginSentenceWrite(self, State state):
- """Change the given state to a BOS state."""
- self.model.BeginSentenceWrite(&state._c_state)
-
- def NullContextWrite(self, State state):
- """Change the given state to a NULL state."""
- self.model.NullContextWrite(&state._c_state)
-
- def BaseScore(self, State in_state, str word, State out_state):
- """
- Return p(word|in_state) and update the output state.
- Wrapper around model.BaseScore(in_state, Index(word), out_state)
-
- :param word: the suffix
- :param state: the context (defaults to NullContext)
- :returns: p(word|state)
- """
- cdef float total = self.model.BaseScore(&in_state._c_state, self.vocab.Index(as_str(word)), &out_state._c_state)
- return total
-
- def BaseFullScore(self, State in_state, str word, State out_state):
- """
- Wrapper around model.BaseScore(in_state, Index(word), out_state)
-
- :param word: the suffix
- :param state: the context (defaults to NullContext)
- :returns: FullScoreReturn(word|state)
- """
- cdef _kenlm.WordIndex wid = self.vocab.Index(as_str(word))
- cdef _kenlm.FullScoreReturn ret = self.model.BaseFullScore(&in_state._c_state, wid, &out_state._c_state)
- return FullScoreReturn(ret.prob, ret.ngram_length, wid == 0)
-
- def __contains__(self, word):
- cdef bytes w = as_str(word)
- return (self.vocab.Index(w) != 0)
-
- def __repr__(self):
- return '<Model from {0}>'.format(os.path.basename(self.path))
-
- def __reduce__(self):
- return (_kenlm.LanguageModel, (self.path,))
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/setup.py
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/setup.py b/ext/kenlm/setup.py
deleted file mode 100644
index ee061a8..0000000
--- a/ext/kenlm/setup.py
+++ /dev/null
@@ -1,48 +0,0 @@
-from distutils.core import setup
-from distutils.extension import Extension
-import glob
-import platform
-import os
-
-#Does gcc compile with this header and library?
-def compile_test(header, library):
- dummy_path = os.path.join(os.path.dirname(__file__), "dummy")
- command = "bash -c \"g++ -include " + header + " -l" + library + " -x c++ - <<<'int main() {}' -o " + dummy_path + " >/dev/null 2>/dev/null && rm " + dummy_path + " 2>/dev/null\""
- return os.system(command) == 0
-
-
-FILES = glob.glob('util/*.cc') + glob.glob('lm/*.cc') + glob.glob('util/double-conversion/*.cc')
-FILES = [fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc'))]
-
-LIBS = ['stdc++']
-if platform.system() != 'Darwin':
- LIBS.append('rt')
-
-
-ARGS = ['-O3', '-DNDEBUG', '-DKENLM_MAX_ORDER=6']
-
-if compile_test('zlib.h', 'z'):
- ARGS.append('-DHAVE_ZLIB')
- LIBS.append('z')
-
-if compile_test('bzlib.h', 'bz2'):
- ARGS.append('-DHAVE_BZLIB')
- LIBS.append('bz2')
-
-if compile_test('lzma.h', 'lzma'):
- ARGS.append('-DHAVE_XZLIB')
- LIBS.append('lzma')
-
-ext_modules = [
- Extension(name='kenlm',
- sources=FILES + ['python/kenlm.cpp'],
- language='C++',
- include_dirs=['.'],
- libraries=LIBS,
- extra_compile_args=ARGS)
-]
-
-setup(
- name='kenlm',
- ext_modules=ext_modules
-)
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/CMakeLists.txt b/ext/kenlm/util/CMakeLists.txt
deleted file mode 100644
index 8a544aa..0000000
--- a/ext/kenlm/util/CMakeLists.txt
+++ /dev/null
@@ -1,81 +0,0 @@
-cmake_minimum_required(VERSION 2.8.8)
-#
-# The KenLM cmake files make use of add_library(... OBJECTS ...)
-#
-# This syntax allows grouping of source files when compiling
-# (effectively creating "fake" libraries based on source subdirs).
-#
-# This syntax was only added in cmake version 2.8.8
-#
-# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
-
-
-# This CMake file was created by Lane Schwartz <do...@gmail.com>
-
-
-# Explicitly list the source files for this subdirectory
-#
-# If you add any source files to this subdirectory
-# that should be included in the kenlm library,
-# (this excludes any unit test files)
-# you should add them to the following list:
-#
-# Because we do not set PARENT_SCOPE in the following definition,
-# CMake files in the parent directory won't be able to access this variable.
-#
-set(KENLM_UTIL_SOURCE
- bit_packing.cc
- ersatz_progress.cc
- exception.cc
- file.cc
- file_piece.cc
- float_to_string.cc
- integer_to_string.cc
- mmap.cc
- murmur_hash.cc
- parallel_read.cc
- pool.cc
- read_compressed.cc
- scoped.cc
- string_piece.cc
- usage.cc
- )
-
-# This directory has children that need to be processed
-add_subdirectory(double-conversion)
-add_subdirectory(stream)
-
-
-# Group these objects together for later use.
-#
-# Given add_library(foo OBJECT ${my_foo_sources}),
-# refer to these objects as $<TARGET_OBJECTS:foo>
-#
-add_library(kenlm_util OBJECT ${KENLM_UTIL_DOUBLECONVERSION_SOURCE} ${KENLM_UTIL_STREAM_SOURCE} ${KENLM_UTIL_SOURCE})
-
-
-
-# Only compile and run unit tests if tests should be run
-if(BUILD_TESTING)
-
- # Explicitly list the Boost test files to be compiled
- set(KENLM_BOOST_TESTS_LIST
- bit_packing_test
- joint_sort_test
- multi_intersection_test
- probing_hash_table_test
- read_compressed_test
- sorted_uniform_test
- tokenize_piece_test
- )
-
- AddTests(TESTS ${KENLM_BOOST_TESTS_LIST}
- DEPENDS $<TARGET_OBJECTS:kenlm_util>
- LIBRARIES ${Boost_LIBRARIES} pthread)
-
- # file_piece_test requires an extra command line parameter
- KenLMAddTest(TEST file_piece_test
- DEPENDS $<TARGET_OBJECTS:kenlm_util>
- LIBRARIES ${Boost_LIBRARIES} pthread
- TEST_ARGS ${CMAKE_CURRENT_SOURCE_DIR}/file_piece.cc)
-endif()
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/Jamfile
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/Jamfile b/ext/kenlm/util/Jamfile
deleted file mode 100644
index 2eeccf4..0000000
--- a/ext/kenlm/util/Jamfile
+++ /dev/null
@@ -1,41 +0,0 @@
-local compressed_flags = <include>.. <define>HAVE_ZLIB ;
-local compressed_deps = /top//z ;
-if [ test_library "bz2" ] && [ test_header "bzlib.h" ] {
- external-lib bz2 ;
- compressed_flags += <define>HAVE_BZLIB ;
- compressed_deps += bz2 ;
-}
-if [ test_library "lzma" ] && [ test_header "lzma.h" ] {
- external-lib lzma ;
- compressed_flags += <define>HAVE_XZLIB ;
- compressed_deps += lzma ;
-}
-
-#rt is needed for clock_gettime on linux. But it's already included with threading=multi
-lib rt ;
-
-obj read_compressed.o : read_compressed.cc : $(compressed_flags) ;
-alias read_compressed : read_compressed.o $(compressed_deps) ;
-obj read_compressed_test.o : read_compressed_test.cc /top//boost_unit_test_framework : $(compressed_flags) ;
-obj file_piece_test.o : file_piece_test.cc /top//boost_unit_test_framework : $(compressed_flags) ;
-
-fakelib parallel_read : parallel_read.cc : <threading>multi:<source>/top//boost_thread <threading>multi:<define>WITH_THREADS : : <include>.. ;
-
-fakelib kenutil : [ glob *.cc : parallel_read.cc read_compressed.cc *_main.cc *_test.cc ] read_compressed parallel_read double-conversion//double-conversion : <include>.. <os>LINUX,<threading>single:<source>rt : : <include>.. ;
-
-exe cat_compressed : cat_compressed_main.cc kenutil ;
-
-#Does not install this
-exe probing_hash_table_benchmark : probing_hash_table_benchmark_main.cc kenutil ;
-
-alias programs : cat_compressed ;
-
-import testing ;
-
-run file_piece_test.o kenutil /top//boost_unit_test_framework : : file_piece.cc ;
-for local t in [ glob *_test.cc : file_piece_test.cc read_compressed_test.cc ] {
- local name = [ MATCH "(.*)\.cc" : $(t) ] ;
- unit-test $(name) : $(t) kenutil /top//boost_unit_test_framework /top//boost_system ;
-}
-
-build-project stream ;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/bit_packing.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/bit_packing.cc b/ext/kenlm/util/bit_packing.cc
deleted file mode 100644
index cffd9cf..0000000
--- a/ext/kenlm/util/bit_packing.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "util/bit_packing.hh"
-#include "util/exception.hh"
-
-#include <cstring>
-
-namespace util {
-
-namespace {
-template <bool> struct StaticCheck {};
-template <> struct StaticCheck<true> { typedef bool StaticAssertionPassed; };
-
-// If your float isn't 4 bytes, we're hosed.
-typedef StaticCheck<sizeof(float) == 4>::StaticAssertionPassed FloatSize;
-
-} // namespace
-
-uint8_t RequiredBits(uint64_t max_value) {
- if (!max_value) return 0;
- uint8_t ret = 1;
- while (max_value >>= 1) ++ret;
- return ret;
-}
-
-void BitPackingSanity() {
- const FloatEnc neg1 = { -1.0 }, pos1 = { 1.0 };
- if ((neg1.i ^ pos1.i) != 0x80000000) UTIL_THROW(Exception, "Sign bit is not 0x80000000");
- char mem[57+8];
- memset(mem, 0, sizeof(mem));
- const uint64_t test57 = 0x123456789abcdefULL;
- for (uint64_t b = 0; b < 57 * 8; b += 57) {
- WriteInt57(mem, b, 57, test57);
- }
- for (uint64_t b = 0; b < 57 * 8; b += 57) {
- if (test57 != ReadInt57(mem, b, 57, (1ULL << 57) - 1))
- UTIL_THROW(Exception, "The bit packing routines are failing for your architecture. Please send a bug report with your architecture, operating system, and compiler.");
- }
- // TODO: more checks.
-}
-
-} // namespace util
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/bit_packing.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/bit_packing.hh b/ext/kenlm/util/bit_packing.hh
deleted file mode 100644
index b24fd9c..0000000
--- a/ext/kenlm/util/bit_packing.hh
+++ /dev/null
@@ -1,185 +0,0 @@
-#ifndef UTIL_BIT_PACKING_H
-#define UTIL_BIT_PACKING_H
-
-/* Bit-level packing routines
- *
- * WARNING WARNING WARNING:
- * The write functions assume that memory is zero initially. This makes them
- * faster and is the appropriate case for mmapped language model construction.
- * These routines assume that unaligned access to uint64_t is fast. This is
- * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on
- * x86 but my target audience is large language models for which 64-bit is
- * necessary.
- *
- * Call the BitPackingSanity function to sanity check. Calling once suffices,
- * but it may be called multiple times when that's inconvenient.
- *
- * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at
- * NICT.
- */
-
-#include <cassert>
-#ifdef __APPLE__
-#include <architecture/byte_order.h>
-#elif __linux__
-#include <endian.h>
-#elif !defined(_WIN32) && !defined(_WIN64)
-#include <arpa/nameser_compat.h>
-#endif
-
-#include <stdint.h>
-#include <cstring>
-
-namespace util {
-
-// Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct.
-#if BYTE_ORDER == LITTLE_ENDIAN
-inline uint8_t BitPackShift(uint8_t bit, uint8_t /*length*/) {
- return bit;
-}
-#elif BYTE_ORDER == BIG_ENDIAN
-inline uint8_t BitPackShift(uint8_t bit, uint8_t length) {
- return 64 - length - bit;
-}
-#else
-#error "Bit packing code isn't written for your byte order."
-#endif
-
-inline uint64_t ReadOff(const void *base, uint64_t bit_off) {
-#if defined(__arm) || defined(__arm__)
- const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
- uint64_t value64;
- memcpy(&value64, base_off, sizeof(value64));
- return value64;
-#else
- return *reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3));
-#endif
-}
-
-/* Pack integers up to 57 bits using their least significant digits.
- * The length is specified using mask:
- * Assumes mask == (1 << length) - 1 where length <= 57.
- */
-inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, uint64_t mask) {
- return (ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, length)) & mask;
-}
-/* Assumes value < (1 << length) and length <= 57.
- * Assumes the memory is zero initially.
- */
-inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) {
-#if defined(__arm) || defined(__arm__)
- uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
- uint64_t value64;
- memcpy(&value64, base_off, sizeof(value64));
- value64 |= (value << BitPackShift(bit_off & 7, length));
- memcpy(base_off, &value64, sizeof(value64));
-#else
- *reinterpret_cast<uint64_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
- (value << BitPackShift(bit_off & 7, length));
-#endif
-}
-
-/* Same caveats as above, but for a 25 bit limit. */
-inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) {
-#if defined(__arm) || defined(__arm__)
- const uint8_t *base_off = reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3);
- uint32_t value32;
- memcpy(&value32, base_off, sizeof(value32));
- return (value32 >> BitPackShift(bit_off & 7, length)) & mask;
-#else
- return (*reinterpret_cast<const uint32_t*>(reinterpret_cast<const uint8_t*>(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask;
-#endif
-}
-
-inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) {
-#if defined(__arm) || defined(__arm__)
- uint8_t *base_off = reinterpret_cast<uint8_t*>(base) + (bit_off >> 3);
- uint32_t value32;
- memcpy(&value32, base_off, sizeof(value32));
- value32 |= (value << BitPackShift(bit_off & 7, length));
- memcpy(base_off, &value32, sizeof(value32));
-#else
- *reinterpret_cast<uint32_t*>(reinterpret_cast<uint8_t*>(base) + (bit_off >> 3)) |=
- (value << BitPackShift(bit_off & 7, length));
-#endif
-}
-
-typedef union { float f; uint32_t i; } FloatEnc;
-
-inline float ReadFloat32(const void *base, uint64_t bit_off) {
- FloatEnc encoded;
- encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 32);
- return encoded.f;
-}
-inline void WriteFloat32(void *base, uint64_t bit_off, float value) {
- FloatEnc encoded;
- encoded.f = value;
- WriteInt57(base, bit_off, 32, encoded.i);
-}
-
-const uint32_t kSignBit = 0x80000000;
-
-inline void SetSign(float &to) {
- FloatEnc enc;
- enc.f = to;
- enc.i |= kSignBit;
- to = enc.f;
-}
-
-inline void UnsetSign(float &to) {
- FloatEnc enc;
- enc.f = to;
- enc.i &= ~kSignBit;
- to = enc.f;
-}
-
-inline float ReadNonPositiveFloat31(const void *base, uint64_t bit_off) {
- FloatEnc encoded;
- encoded.i = ReadOff(base, bit_off) >> BitPackShift(bit_off & 7, 31);
- // Sign bit set means negative.
- encoded.i |= kSignBit;
- return encoded.f;
-}
-inline void WriteNonPositiveFloat31(void *base, uint64_t bit_off, float value) {
- FloatEnc encoded;
- encoded.f = value;
- encoded.i &= ~kSignBit;
- WriteInt57(base, bit_off, 31, encoded.i);
-}
-
-void BitPackingSanity();
-
-// Return bits required to store integers upto max_value. Not the most
-// efficient implementation, but this is only called a few times to size tries.
-uint8_t RequiredBits(uint64_t max_value);
-
-struct BitsMask {
- static BitsMask ByMax(uint64_t max_value) {
- BitsMask ret;
- ret.FromMax(max_value);
- return ret;
- }
- static BitsMask ByBits(uint8_t bits) {
- BitsMask ret;
- ret.bits = bits;
- ret.mask = (1ULL << bits) - 1;
- return ret;
- }
- void FromMax(uint64_t max_value) {
- bits = RequiredBits(max_value);
- mask = (1ULL << bits) - 1;
- }
- uint8_t bits;
- uint64_t mask;
-};
-
-struct BitAddress {
- BitAddress(void *in_base, uint64_t in_offset) : base(in_base), offset(in_offset) {}
-
- void *base;
- uint64_t offset;
-};
-
-} // namespace util
-
-#endif // UTIL_BIT_PACKING_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/bit_packing_test.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/bit_packing_test.cc b/ext/kenlm/util/bit_packing_test.cc
deleted file mode 100644
index c4494b6..0000000
--- a/ext/kenlm/util/bit_packing_test.cc
+++ /dev/null
@@ -1,59 +0,0 @@
-#include "util/bit_packing.hh"
-
-#define BOOST_TEST_MODULE BitPackingTest
-#include <boost/test/unit_test.hpp>
-
-#include <cstring>
-
-namespace util {
-namespace {
-
-const uint64_t test57 = 0x123456789abcdefULL;
-const uint32_t test25 = 0x1234567;
-
-BOOST_AUTO_TEST_CASE(ZeroBit57) {
- char mem[16];
- memset(mem, 0, sizeof(mem));
- WriteInt57(mem, 0, 57, test57);
- BOOST_CHECK_EQUAL(test57, ReadInt57(mem, 0, 57, (1ULL << 57) - 1));
-}
-
-BOOST_AUTO_TEST_CASE(EachBit57) {
- char mem[16];
- for (uint8_t b = 0; b < 8; ++b) {
- memset(mem, 0, sizeof(mem));
- WriteInt57(mem, b, 57, test57);
- BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
- }
-}
-
-BOOST_AUTO_TEST_CASE(Consecutive57) {
- char mem[57+8];
- memset(mem, 0, sizeof(mem));
- for (uint64_t b = 0; b < 57 * 8; b += 57) {
- WriteInt57(mem, b, 57, test57);
- BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
- }
- for (uint64_t b = 0; b < 57 * 8; b += 57) {
- BOOST_CHECK_EQUAL(test57, ReadInt57(mem, b, 57, (1ULL << 57) - 1));
- }
-}
-
-BOOST_AUTO_TEST_CASE(Consecutive25) {
- char mem[25+8];
- memset(mem, 0, sizeof(mem));
- for (uint64_t b = 0; b < 25 * 8; b += 25) {
- WriteInt25(mem, b, 25, test25);
- BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
- }
- for (uint64_t b = 0; b < 25 * 8; b += 25) {
- BOOST_CHECK_EQUAL(test25, ReadInt25(mem, b, 25, (1ULL << 25) - 1));
- }
-}
-
-BOOST_AUTO_TEST_CASE(Sanity) {
- BitPackingSanity();
-}
-
-} // namespace
-} // namespace util
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/cat_compressed_main.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/cat_compressed_main.cc b/ext/kenlm/util/cat_compressed_main.cc
deleted file mode 100644
index 0c7cda9..0000000
--- a/ext/kenlm/util/cat_compressed_main.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-// Like cat but interprets compressed files.
-#include "util/file.hh"
-#include "util/read_compressed.hh"
-
-#include <cstring>
-#include <iostream>
-
-namespace {
-const std::size_t kBufSize = 16384;
-void Copy(util::ReadCompressed &from, int to) {
- util::scoped_malloc buffer(util::MallocOrThrow(kBufSize));
- while (std::size_t amount = from.Read(buffer.get(), kBufSize)) {
- util::WriteOrThrow(to, buffer.get(), amount);
- }
-}
-} // namespace
-
-int main(int argc, char *argv[]) {
- // Lane Schwartz likes -h and --help
- for (int i = 1; i < argc; ++i) {
- char *arg = argv[i];
- if (!strcmp(arg, "--")) break;
- if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
- std::cerr <<
- "A cat implementation that interprets compressed files.\n"
- "Usage: " << argv[0] << " [file1] [file2] ...\n"
- "If no file is provided, then stdin is read.\n";
- return 1;
- }
- }
-
- try {
- if (argc == 1) {
- util::ReadCompressed in(0);
- Copy(in, 1);
- } else {
- for (int i = 1; i < argc; ++i) {
- util::ReadCompressed in(util::OpenReadOrThrow(argv[i]));
- Copy(in, 1);
- }
- }
- } catch (const std::exception &e) {
- std::cerr << e.what() << std::endl;
- return 2;
- }
- return 0;
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/CMakeLists.txt b/ext/kenlm/util/double-conversion/CMakeLists.txt
deleted file mode 100644
index e2cf02a..0000000
--- a/ext/kenlm/util/double-conversion/CMakeLists.txt
+++ /dev/null
@@ -1,39 +0,0 @@
-cmake_minimum_required(VERSION 2.8.8)
-#
-# The KenLM cmake files make use of add_library(... OBJECTS ...)
-#
-# This syntax allows grouping of source files when compiling
-# (effectively creating "fake" libraries based on source subdirs).
-#
-# This syntax was only added in cmake version 2.8.8
-#
-# see http://www.cmake.org/Wiki/CMake/Tutorials/Object_Library
-
-
-# This CMake file was created by Lane Schwartz <do...@gmail.com>
-
-# Explicitly list the source files for this subdirectory
-#
-# If you add any source files to this subdirectory
-# that should be included in the kenlm library,
-# (this excludes any unit test files)
-# you should add them to the following list:
-#
-# In order to allow CMake files in the parent directory
-# to see this variable definition, we set PARENT_SCOPE.
-#
-# In order to set correct paths to these files
-# when this variable is referenced by CMake files in the parent directory,
-# we prefix all files with ${CMAKE_CURRENT_SOURCE_DIR}.
-#
-set(KENLM_UTIL_DOUBLECONVERSION_SOURCE
- ${CMAKE_CURRENT_SOURCE_DIR}/bignum-dtoa.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/bignum.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/cached-powers.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/diy-fp.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/double-conversion.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/fast-dtoa.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/fixed-dtoa.cc
- ${CMAKE_CURRENT_SOURCE_DIR}/strtod.cc
- PARENT_SCOPE)
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/Jamfile
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/Jamfile b/ext/kenlm/util/double-conversion/Jamfile
deleted file mode 100644
index 7a92afa..0000000
--- a/ext/kenlm/util/double-conversion/Jamfile
+++ /dev/null
@@ -1 +0,0 @@
-fakelib double-conversion : [ glob *.cc ] : : : <include>. ;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/LICENSE
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/LICENSE b/ext/kenlm/util/double-conversion/LICENSE
deleted file mode 100644
index 933718a..0000000
--- a/ext/kenlm/util/double-conversion/LICENSE
+++ /dev/null
@@ -1,26 +0,0 @@
-Copyright 2006-2011, the V8 project authors. All rights reserved.
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
- * Redistributions in binary form must reproduce the above
- copyright notice, this list of conditions and the following
- disclaimer in the documentation and/or other materials provided
- with the distribution.
- * Neither the name of Google Inc. nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/bignum-dtoa.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/bignum-dtoa.cc b/ext/kenlm/util/double-conversion/bignum-dtoa.cc
deleted file mode 100644
index 3d217bf..0000000
--- a/ext/kenlm/util/double-conversion/bignum-dtoa.cc
+++ /dev/null
@@ -1,640 +0,0 @@
-// Copyright 2010 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <cmath>
-
-#include "bignum-dtoa.h"
-
-#include "bignum.h"
-#include "ieee.h"
-
-namespace double_conversion {
-
-static int NormalizedExponent(uint64_t significand, int exponent) {
- ASSERT(significand != 0);
- while ((significand & Double::kHiddenBit) == 0) {
- significand = significand << 1;
- exponent = exponent - 1;
- }
- return exponent;
-}
-
-
-// Forward declarations:
-// Returns an estimation of k such that 10^(k-1) <= v < 10^k.
-static int EstimatePower(int exponent);
-// Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator
-// and denominator.
-static void InitialScaledStartValues(uint64_t significand,
- int exponent,
- bool lower_boundary_is_closer,
- int estimated_power,
- bool need_boundary_deltas,
- Bignum* numerator,
- Bignum* denominator,
- Bignum* delta_minus,
- Bignum* delta_plus);
-// Multiplies numerator/denominator so that its values lies in the range 1-10.
-// Returns decimal_point s.t.
-// v = numerator'/denominator' * 10^(decimal_point-1)
-// where numerator' and denominator' are the values of numerator and
-// denominator after the call to this function.
-static void FixupMultiply10(int estimated_power, bool is_even,
- int* decimal_point,
- Bignum* numerator, Bignum* denominator,
- Bignum* delta_minus, Bignum* delta_plus);
-// Generates digits from the left to the right and stops when the generated
-// digits yield the shortest decimal representation of v.
-static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator,
- Bignum* delta_minus, Bignum* delta_plus,
- bool is_even,
- Vector<char> buffer, int* length);
-// Generates 'requested_digits' after the decimal point.
-static void BignumToFixed(int requested_digits, int* decimal_point,
- Bignum* numerator, Bignum* denominator,
- Vector<char>(buffer), int* length);
-// Generates 'count' digits of numerator/denominator.
-// Once 'count' digits have been produced rounds the result depending on the
-// remainder (remainders of exactly .5 round upwards). Might update the
-// decimal_point when rounding up (for example for 0.9999).
-static void GenerateCountedDigits(int count, int* decimal_point,
- Bignum* numerator, Bignum* denominator,
- Vector<char>(buffer), int* length);
-
-
-void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits,
- Vector<char> buffer, int* length, int* decimal_point) {
- ASSERT(v > 0);
- ASSERT(!Double(v).IsSpecial());
- uint64_t significand;
- int exponent;
- bool lower_boundary_is_closer;
- if (mode == BIGNUM_DTOA_SHORTEST_SINGLE) {
- float f = static_cast<float>(v);
- ASSERT(f == v);
- significand = Single(f).Significand();
- exponent = Single(f).Exponent();
- lower_boundary_is_closer = Single(f).LowerBoundaryIsCloser();
- } else {
- significand = Double(v).Significand();
- exponent = Double(v).Exponent();
- lower_boundary_is_closer = Double(v).LowerBoundaryIsCloser();
- }
- bool need_boundary_deltas =
- (mode == BIGNUM_DTOA_SHORTEST || mode == BIGNUM_DTOA_SHORTEST_SINGLE);
-
- bool is_even = (significand & 1) == 0;
- int normalized_exponent = NormalizedExponent(significand, exponent);
- // estimated_power might be too low by 1.
- int estimated_power = EstimatePower(normalized_exponent);
-
- // Shortcut for Fixed.
- // The requested digits correspond to the digits after the point. If the
- // number is much too small, then there is no need in trying to get any
- // digits.
- if (mode == BIGNUM_DTOA_FIXED && -estimated_power - 1 > requested_digits) {
- buffer[0] = '\0';
- *length = 0;
- // Set decimal-point to -requested_digits. This is what Gay does.
- // Note that it should not have any effect anyways since the string is
- // empty.
- *decimal_point = -requested_digits;
- return;
- }
-
- Bignum numerator;
- Bignum denominator;
- Bignum delta_minus;
- Bignum delta_plus;
- // Make sure the bignum can grow large enough. The smallest double equals
- // 4e-324. In this case the denominator needs fewer than 324*4 binary digits.
- // The maximum double is 1.7976931348623157e308 which needs fewer than
- // 308*4 binary digits.
- ASSERT(Bignum::kMaxSignificantBits >= 324*4);
- InitialScaledStartValues(significand, exponent, lower_boundary_is_closer,
- estimated_power, need_boundary_deltas,
- &numerator, &denominator,
- &delta_minus, &delta_plus);
- // We now have v = (numerator / denominator) * 10^estimated_power.
- FixupMultiply10(estimated_power, is_even, decimal_point,
- &numerator, &denominator,
- &delta_minus, &delta_plus);
- // We now have v = (numerator / denominator) * 10^(decimal_point-1), and
- // 1 <= (numerator + delta_plus) / denominator < 10
- switch (mode) {
- case BIGNUM_DTOA_SHORTEST:
- case BIGNUM_DTOA_SHORTEST_SINGLE:
- GenerateShortestDigits(&numerator, &denominator,
- &delta_minus, &delta_plus,
- is_even, buffer, length);
- break;
- case BIGNUM_DTOA_FIXED:
- BignumToFixed(requested_digits, decimal_point,
- &numerator, &denominator,
- buffer, length);
- break;
- case BIGNUM_DTOA_PRECISION:
- GenerateCountedDigits(requested_digits, decimal_point,
- &numerator, &denominator,
- buffer, length);
- break;
- default:
- UNREACHABLE();
- }
- buffer[*length] = '\0';
-}
-
-
-// The procedure starts generating digits from the left to the right and stops
-// when the generated digits yield the shortest decimal representation of v. A
-// decimal representation of v is a number lying closer to v than to any other
-// double, so it converts to v when read.
-//
-// This is true if d, the decimal representation, is between m- and m+, the
-// upper and lower boundaries. d must be strictly between them if !is_even.
-// m- := (numerator - delta_minus) / denominator
-// m+ := (numerator + delta_plus) / denominator
-//
-// Precondition: 0 <= (numerator+delta_plus) / denominator < 10.
-// If 1 <= (numerator+delta_plus) / denominator < 10 then no leading 0 digit
-// will be produced. This should be the standard precondition.
-static void GenerateShortestDigits(Bignum* numerator, Bignum* denominator,
- Bignum* delta_minus, Bignum* delta_plus,
- bool is_even,
- Vector<char> buffer, int* length) {
- // Small optimization: if delta_minus and delta_plus are the same just reuse
- // one of the two bignums.
- if (Bignum::Equal(*delta_minus, *delta_plus)) {
- delta_plus = delta_minus;
- }
- *length = 0;
- while (true) {
- uint16_t digit;
- digit = numerator->DivideModuloIntBignum(*denominator);
- ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive.
- // digit = numerator / denominator (integer division).
- // numerator = numerator % denominator.
- buffer[(*length)++] = digit + '0';
-
- // Can we stop already?
- // If the remainder of the division is less than the distance to the lower
- // boundary we can stop. In this case we simply round down (discarding the
- // remainder).
- // Similarly we test if we can round up (using the upper boundary).
- bool in_delta_room_minus;
- bool in_delta_room_plus;
- if (is_even) {
- in_delta_room_minus = Bignum::LessEqual(*numerator, *delta_minus);
- } else {
- in_delta_room_minus = Bignum::Less(*numerator, *delta_minus);
- }
- if (is_even) {
- in_delta_room_plus =
- Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0;
- } else {
- in_delta_room_plus =
- Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0;
- }
- if (!in_delta_room_minus && !in_delta_room_plus) {
- // Prepare for next iteration.
- numerator->Times10();
- delta_minus->Times10();
- // We optimized delta_plus to be equal to delta_minus (if they share the
- // same value). So don't multiply delta_plus if they point to the same
- // object.
- if (delta_minus != delta_plus) {
- delta_plus->Times10();
- }
- } else if (in_delta_room_minus && in_delta_room_plus) {
- // Let's see if 2*numerator < denominator.
- // If yes, then the next digit would be < 5 and we can round down.
- int compare = Bignum::PlusCompare(*numerator, *numerator, *denominator);
- if (compare < 0) {
- // Remaining digits are less than .5. -> Round down (== do nothing).
- } else if (compare > 0) {
- // Remaining digits are more than .5 of denominator. -> Round up.
- // Note that the last digit could not be a '9' as otherwise the whole
- // loop would have stopped earlier.
- // We still have an assert here in case the preconditions were not
- // satisfied.
- ASSERT(buffer[(*length) - 1] != '9');
- buffer[(*length) - 1]++;
- } else {
- // Halfway case.
- // TODO(floitsch): need a way to solve half-way cases.
- // For now let's round towards even (since this is what Gay seems to
- // do).
-
- if ((buffer[(*length) - 1] - '0') % 2 == 0) {
- // Round down => Do nothing.
- } else {
- ASSERT(buffer[(*length) - 1] != '9');
- buffer[(*length) - 1]++;
- }
- }
- return;
- } else if (in_delta_room_minus) {
- // Round down (== do nothing).
- return;
- } else { // in_delta_room_plus
- // Round up.
- // Note again that the last digit could not be '9' since this would have
- // stopped the loop earlier.
- // We still have an ASSERT here, in case the preconditions were not
- // satisfied.
- ASSERT(buffer[(*length) -1] != '9');
- buffer[(*length) - 1]++;
- return;
- }
- }
-}
-
-
-// Let v = numerator / denominator < 10.
-// Then we generate 'count' digits of d = x.xxxxx... (without the decimal point)
-// from left to right. Once 'count' digits have been produced we decide wether
-// to round up or down. Remainders of exactly .5 round upwards. Numbers such
-// as 9.999999 propagate a carry all the way, and change the
-// exponent (decimal_point), when rounding upwards.
-static void GenerateCountedDigits(int count, int* decimal_point,
- Bignum* numerator, Bignum* denominator,
- Vector<char>(buffer), int* length) {
- ASSERT(count >= 0);
- for (int i = 0; i < count - 1; ++i) {
- uint16_t digit;
- digit = numerator->DivideModuloIntBignum(*denominator);
- ASSERT(digit <= 9); // digit is a uint16_t and therefore always positive.
- // digit = numerator / denominator (integer division).
- // numerator = numerator % denominator.
- buffer[i] = digit + '0';
- // Prepare for next iteration.
- numerator->Times10();
- }
- // Generate the last digit.
- uint16_t digit;
- digit = numerator->DivideModuloIntBignum(*denominator);
- if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) {
- digit++;
- }
- buffer[count - 1] = digit + '0';
- // Correct bad digits (in case we had a sequence of '9's). Propagate the
- // carry until we hat a non-'9' or til we reach the first digit.
- for (int i = count - 1; i > 0; --i) {
- if (buffer[i] != '0' + 10) break;
- buffer[i] = '0';
- buffer[i - 1]++;
- }
- if (buffer[0] == '0' + 10) {
- // Propagate a carry past the top place.
- buffer[0] = '1';
- (*decimal_point)++;
- }
- *length = count;
-}
-
-
-// Generates 'requested_digits' after the decimal point. It might omit
-// trailing '0's. If the input number is too small then no digits at all are
-// generated (ex.: 2 fixed digits for 0.00001).
-//
-// Input verifies: 1 <= (numerator + delta) / denominator < 10.
-static void BignumToFixed(int requested_digits, int* decimal_point,
- Bignum* numerator, Bignum* denominator,
- Vector<char>(buffer), int* length) {
- // Note that we have to look at more than just the requested_digits, since
- // a number could be rounded up. Example: v=0.5 with requested_digits=0.
- // Even though the power of v equals 0 we can't just stop here.
- if (-(*decimal_point) > requested_digits) {
- // The number is definitively too small.
- // Ex: 0.001 with requested_digits == 1.
- // Set decimal-point to -requested_digits. This is what Gay does.
- // Note that it should not have any effect anyways since the string is
- // empty.
- *decimal_point = -requested_digits;
- *length = 0;
- return;
- } else if (-(*decimal_point) == requested_digits) {
- // We only need to verify if the number rounds down or up.
- // Ex: 0.04 and 0.06 with requested_digits == 1.
- ASSERT(*decimal_point == -requested_digits);
- // Initially the fraction lies in range (1, 10]. Multiply the denominator
- // by 10 so that we can compare more easily.
- denominator->Times10();
- if (Bignum::PlusCompare(*numerator, *numerator, *denominator) >= 0) {
- // If the fraction is >= 0.5 then we have to include the rounded
- // digit.
- buffer[0] = '1';
- *length = 1;
- (*decimal_point)++;
- } else {
- // Note that we caught most of similar cases earlier.
- *length = 0;
- }
- return;
- } else {
- // The requested digits correspond to the digits after the point.
- // The variable 'needed_digits' includes the digits before the point.
- int needed_digits = (*decimal_point) + requested_digits;
- GenerateCountedDigits(needed_digits, decimal_point,
- numerator, denominator,
- buffer, length);
- }
-}
-
-
-// Returns an estimation of k such that 10^(k-1) <= v < 10^k where
-// v = f * 2^exponent and 2^52 <= f < 2^53.
-// v is hence a normalized double with the given exponent. The output is an
-// approximation for the exponent of the decimal approimation .digits * 10^k.
-//
-// The result might undershoot by 1 in which case 10^k <= v < 10^k+1.
-// Note: this property holds for v's upper boundary m+ too.
-// 10^k <= m+ < 10^k+1.
-// (see explanation below).
-//
-// Examples:
-// EstimatePower(0) => 16
-// EstimatePower(-52) => 0
-//
-// Note: e >= 0 => EstimatedPower(e) > 0. No similar claim can be made for e<0.
-static int EstimatePower(int exponent) {
- // This function estimates log10 of v where v = f*2^e (with e == exponent).
- // Note that 10^floor(log10(v)) <= v, but v <= 10^ceil(log10(v)).
- // Note that f is bounded by its container size. Let p = 53 (the double's
- // significand size). Then 2^(p-1) <= f < 2^p.
- //
- // Given that log10(v) == log2(v)/log2(10) and e+(len(f)-1) is quite close
- // to log2(v) the function is simplified to (e+(len(f)-1)/log2(10)).
- // The computed number undershoots by less than 0.631 (when we compute log3
- // and not log10).
- //
- // Optimization: since we only need an approximated result this computation
- // can be performed on 64 bit integers. On x86/x64 architecture the speedup is
- // not really measurable, though.
- //
- // Since we want to avoid overshooting we decrement by 1e10 so that
- // floating-point imprecisions don't affect us.
- //
- // Explanation for v's boundary m+: the computation takes advantage of
- // the fact that 2^(p-1) <= f < 2^p. Boundaries still satisfy this requirement
- // (even for denormals where the delta can be much more important).
-
- const double k1Log10 = 0.30102999566398114; // 1/lg(10)
-
- // For doubles len(f) == 53 (don't forget the hidden bit).
- const int kSignificandSize = Double::kSignificandSize;
- double estimate = ceil((exponent + kSignificandSize - 1) * k1Log10 - 1e-10);
- return static_cast<int>(estimate);
-}
-
-
-// See comments for InitialScaledStartValues.
-static void InitialScaledStartValuesPositiveExponent(
- uint64_t significand, int exponent,
- int estimated_power, bool need_boundary_deltas,
- Bignum* numerator, Bignum* denominator,
- Bignum* delta_minus, Bignum* delta_plus) {
- // A positive exponent implies a positive power.
- ASSERT(estimated_power >= 0);
- // Since the estimated_power is positive we simply multiply the denominator
- // by 10^estimated_power.
-
- // numerator = v.
- numerator->AssignUInt64(significand);
- numerator->ShiftLeft(exponent);
- // denominator = 10^estimated_power.
- denominator->AssignPowerUInt16(10, estimated_power);
-
- if (need_boundary_deltas) {
- // Introduce a common denominator so that the deltas to the boundaries are
- // integers.
- denominator->ShiftLeft(1);
- numerator->ShiftLeft(1);
- // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common
- // denominator (of 2) delta_plus equals 2^e.
- delta_plus->AssignUInt16(1);
- delta_plus->ShiftLeft(exponent);
- // Same for delta_minus. The adjustments if f == 2^p-1 are done later.
- delta_minus->AssignUInt16(1);
- delta_minus->ShiftLeft(exponent);
- }
-}
-
-
-// See comments for InitialScaledStartValues
-static void InitialScaledStartValuesNegativeExponentPositivePower(
- uint64_t significand, int exponent,
- int estimated_power, bool need_boundary_deltas,
- Bignum* numerator, Bignum* denominator,
- Bignum* delta_minus, Bignum* delta_plus) {
- // v = f * 2^e with e < 0, and with estimated_power >= 0.
- // This means that e is close to 0 (have a look at how estimated_power is
- // computed).
-
- // numerator = significand
- // since v = significand * 2^exponent this is equivalent to
- // numerator = v * / 2^-exponent
- numerator->AssignUInt64(significand);
- // denominator = 10^estimated_power * 2^-exponent (with exponent < 0)
- denominator->AssignPowerUInt16(10, estimated_power);
- denominator->ShiftLeft(-exponent);
-
- if (need_boundary_deltas) {
- // Introduce a common denominator so that the deltas to the boundaries are
- // integers.
- denominator->ShiftLeft(1);
- numerator->ShiftLeft(1);
- // Let v = f * 2^e, then m+ - v = 1/2 * 2^e; With the common
- // denominator (of 2) delta_plus equals 2^e.
- // Given that the denominator already includes v's exponent the distance
- // to the boundaries is simply 1.
- delta_plus->AssignUInt16(1);
- // Same for delta_minus. The adjustments if f == 2^p-1 are done later.
- delta_minus->AssignUInt16(1);
- }
-}
-
-
-// See comments for InitialScaledStartValues
-static void InitialScaledStartValuesNegativeExponentNegativePower(
- uint64_t significand, int exponent,
- int estimated_power, bool need_boundary_deltas,
- Bignum* numerator, Bignum* denominator,
- Bignum* delta_minus, Bignum* delta_plus) {
- // Instead of multiplying the denominator with 10^estimated_power we
- // multiply all values (numerator and deltas) by 10^-estimated_power.
-
- // Use numerator as temporary container for power_ten.
- Bignum* power_ten = numerator;
- power_ten->AssignPowerUInt16(10, -estimated_power);
-
- if (need_boundary_deltas) {
- // Since power_ten == numerator we must make a copy of 10^estimated_power
- // before we complete the computation of the numerator.
- // delta_plus = delta_minus = 10^estimated_power
- delta_plus->AssignBignum(*power_ten);
- delta_minus->AssignBignum(*power_ten);
- }
-
- // numerator = significand * 2 * 10^-estimated_power
- // since v = significand * 2^exponent this is equivalent to
- // numerator = v * 10^-estimated_power * 2 * 2^-exponent.
- // Remember: numerator has been abused as power_ten. So no need to assign it
- // to itself.
- ASSERT(numerator == power_ten);
- numerator->MultiplyByUInt64(significand);
-
- // denominator = 2 * 2^-exponent with exponent < 0.
- denominator->AssignUInt16(1);
- denominator->ShiftLeft(-exponent);
-
- if (need_boundary_deltas) {
- // Introduce a common denominator so that the deltas to the boundaries are
- // integers.
- numerator->ShiftLeft(1);
- denominator->ShiftLeft(1);
- // With this shift the boundaries have their correct value, since
- // delta_plus = 10^-estimated_power, and
- // delta_minus = 10^-estimated_power.
- // These assignments have been done earlier.
- // The adjustments if f == 2^p-1 (lower boundary is closer) are done later.
- }
-}
-
-
-// Let v = significand * 2^exponent.
-// Computes v / 10^estimated_power exactly, as a ratio of two bignums, numerator
-// and denominator. The functions GenerateShortestDigits and
-// GenerateCountedDigits will then convert this ratio to its decimal
-// representation d, with the required accuracy.
-// Then d * 10^estimated_power is the representation of v.
-// (Note: the fraction and the estimated_power might get adjusted before
-// generating the decimal representation.)
-//
-// The initial start values consist of:
-// - a scaled numerator: s.t. numerator/denominator == v / 10^estimated_power.
-// - a scaled (common) denominator.
-// optionally (used by GenerateShortestDigits to decide if it has the shortest
-// decimal converting back to v):
-// - v - m-: the distance to the lower boundary.
-// - m+ - v: the distance to the upper boundary.
-//
-// v, m+, m-, and therefore v - m- and m+ - v all share the same denominator.
-//
-// Let ep == estimated_power, then the returned values will satisfy:
-// v / 10^ep = numerator / denominator.
-// v's boundarys m- and m+:
-// m- / 10^ep == v / 10^ep - delta_minus / denominator
-// m+ / 10^ep == v / 10^ep + delta_plus / denominator
-// Or in other words:
-// m- == v - delta_minus * 10^ep / denominator;
-// m+ == v + delta_plus * 10^ep / denominator;
-//
-// Since 10^(k-1) <= v < 10^k (with k == estimated_power)
-// or 10^k <= v < 10^(k+1)
-// we then have 0.1 <= numerator/denominator < 1
-// or 1 <= numerator/denominator < 10
-//
-// It is then easy to kickstart the digit-generation routine.
-//
-// The boundary-deltas are only filled if the mode equals BIGNUM_DTOA_SHORTEST
-// or BIGNUM_DTOA_SHORTEST_SINGLE.
-
-static void InitialScaledStartValues(uint64_t significand,
- int exponent,
- bool lower_boundary_is_closer,
- int estimated_power,
- bool need_boundary_deltas,
- Bignum* numerator,
- Bignum* denominator,
- Bignum* delta_minus,
- Bignum* delta_plus) {
- if (exponent >= 0) {
- InitialScaledStartValuesPositiveExponent(
- significand, exponent, estimated_power, need_boundary_deltas,
- numerator, denominator, delta_minus, delta_plus);
- } else if (estimated_power >= 0) {
- InitialScaledStartValuesNegativeExponentPositivePower(
- significand, exponent, estimated_power, need_boundary_deltas,
- numerator, denominator, delta_minus, delta_plus);
- } else {
- InitialScaledStartValuesNegativeExponentNegativePower(
- significand, exponent, estimated_power, need_boundary_deltas,
- numerator, denominator, delta_minus, delta_plus);
- }
-
- if (need_boundary_deltas && lower_boundary_is_closer) {
- // The lower boundary is closer at half the distance of "normal" numbers.
- // Increase the common denominator and adapt all but the delta_minus.
- denominator->ShiftLeft(1); // *2
- numerator->ShiftLeft(1); // *2
- delta_plus->ShiftLeft(1); // *2
- }
-}
-
-
-// This routine multiplies numerator/denominator so that its values lies in the
-// range 1-10. That is after a call to this function we have:
-// 1 <= (numerator + delta_plus) /denominator < 10.
-// Let numerator the input before modification and numerator' the argument
-// after modification, then the output-parameter decimal_point is such that
-// numerator / denominator * 10^estimated_power ==
-// numerator' / denominator' * 10^(decimal_point - 1)
-// In some cases estimated_power was too low, and this is already the case. We
-// then simply adjust the power so that 10^(k-1) <= v < 10^k (with k ==
-// estimated_power) but do not touch the numerator or denominator.
-// Otherwise the routine multiplies the numerator and the deltas by 10.
-static void FixupMultiply10(int estimated_power, bool is_even,
- int* decimal_point,
- Bignum* numerator, Bignum* denominator,
- Bignum* delta_minus, Bignum* delta_plus) {
- bool in_range;
- if (is_even) {
- // For IEEE doubles half-way cases (in decimal system numbers ending with 5)
- // are rounded to the closest floating-point number with even significand.
- in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) >= 0;
- } else {
- in_range = Bignum::PlusCompare(*numerator, *delta_plus, *denominator) > 0;
- }
- if (in_range) {
- // Since numerator + delta_plus >= denominator we already have
- // 1 <= numerator/denominator < 10. Simply update the estimated_power.
- *decimal_point = estimated_power + 1;
- } else {
- *decimal_point = estimated_power;
- numerator->Times10();
- if (Bignum::Equal(*delta_minus, *delta_plus)) {
- delta_minus->Times10();
- delta_plus->AssignBignum(*delta_minus);
- } else {
- delta_minus->Times10();
- delta_plus->Times10();
- }
- }
-}
-
-} // namespace double_conversion
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/bignum-dtoa.h
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/bignum-dtoa.h b/ext/kenlm/util/double-conversion/bignum-dtoa.h
deleted file mode 100644
index 34b9619..0000000
--- a/ext/kenlm/util/double-conversion/bignum-dtoa.h
+++ /dev/null
@@ -1,84 +0,0 @@
-// Copyright 2010 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef DOUBLE_CONVERSION_BIGNUM_DTOA_H_
-#define DOUBLE_CONVERSION_BIGNUM_DTOA_H_
-
-#include "utils.h"
-
-namespace double_conversion {
-
-enum BignumDtoaMode {
- // Return the shortest correct representation.
- // For example the output of 0.299999999999999988897 is (the less accurate but
- // correct) 0.3.
- BIGNUM_DTOA_SHORTEST,
- // Same as BIGNUM_DTOA_SHORTEST but for single-precision floats.
- BIGNUM_DTOA_SHORTEST_SINGLE,
- // Return a fixed number of digits after the decimal point.
- // For instance fixed(0.1, 4) becomes 0.1000
- // If the input number is big, the output will be big.
- BIGNUM_DTOA_FIXED,
- // Return a fixed number of digits, no matter what the exponent is.
- BIGNUM_DTOA_PRECISION
-};
-
-// Converts the given double 'v' to ascii.
-// The result should be interpreted as buffer * 10^(point-length).
-// The buffer will be null-terminated.
-//
-// The input v must be > 0 and different from NaN, and Infinity.
-//
-// The output depends on the given mode:
-// - SHORTEST: produce the least amount of digits for which the internal
-// identity requirement is still satisfied. If the digits are printed
-// (together with the correct exponent) then reading this number will give
-// 'v' again. The buffer will choose the representation that is closest to
-// 'v'. If there are two at the same distance, than the number is round up.
-// In this mode the 'requested_digits' parameter is ignored.
-// - FIXED: produces digits necessary to print a given number with
-// 'requested_digits' digits after the decimal point. The produced digits
-// might be too short in which case the caller has to fill the gaps with '0's.
-// Example: toFixed(0.001, 5) is allowed to return buffer="1", point=-2.
-// Halfway cases are rounded up. The call toFixed(0.15, 2) thus returns
-// buffer="2", point=0.
-// Note: the length of the returned buffer has no meaning wrt the significance
-// of its digits. That is, just because it contains '0's does not mean that
-// any other digit would not satisfy the internal identity requirement.
-// - PRECISION: produces 'requested_digits' where the first digit is not '0'.
-// Even though the length of produced digits usually equals
-// 'requested_digits', the function is allowed to return fewer digits, in
-// which case the caller has to fill the missing digits with '0's.
-// Halfway cases are again rounded up.
-// 'BignumDtoa' expects the given buffer to be big enough to hold all digits
-// and a terminating null-character.
-void BignumDtoa(double v, BignumDtoaMode mode, int requested_digits,
- Vector<char> buffer, int* length, int* point);
-
-} // namespace double_conversion
-
-#endif // DOUBLE_CONVERSION_BIGNUM_DTOA_H_
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/bignum.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/bignum.cc b/ext/kenlm/util/double-conversion/bignum.cc
deleted file mode 100644
index 747491a..0000000
--- a/ext/kenlm/util/double-conversion/bignum.cc
+++ /dev/null
@@ -1,764 +0,0 @@
-// Copyright 2010 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include "bignum.h"
-#include "utils.h"
-
-namespace double_conversion {
-
-Bignum::Bignum()
- : bigits_(bigits_buffer_, kBigitCapacity), used_digits_(0), exponent_(0) {
- for (int i = 0; i < kBigitCapacity; ++i) {
- bigits_[i] = 0;
- }
-}
-
-
-template<typename S>
-static int BitSize(S value) {
- return 8 * sizeof(value);
-}
-
-// Guaranteed to lie in one Bigit.
-void Bignum::AssignUInt16(uint16_t value) {
- ASSERT(kBigitSize >= BitSize(value));
- Zero();
- if (value == 0) return;
-
- EnsureCapacity(1);
- bigits_[0] = value;
- used_digits_ = 1;
-}
-
-
-void Bignum::AssignUInt64(uint64_t value) {
- const int kUInt64Size = 64;
-
- Zero();
- if (value == 0) return;
-
- int needed_bigits = kUInt64Size / kBigitSize + 1;
- EnsureCapacity(needed_bigits);
- for (int i = 0; i < needed_bigits; ++i) {
- bigits_[i] = value & kBigitMask;
- value = value >> kBigitSize;
- }
- used_digits_ = needed_bigits;
- Clamp();
-}
-
-
-void Bignum::AssignBignum(const Bignum& other) {
- exponent_ = other.exponent_;
- for (int i = 0; i < other.used_digits_; ++i) {
- bigits_[i] = other.bigits_[i];
- }
- // Clear the excess digits (if there were any).
- for (int i = other.used_digits_; i < used_digits_; ++i) {
- bigits_[i] = 0;
- }
- used_digits_ = other.used_digits_;
-}
-
-
-static uint64_t ReadUInt64(Vector<const char> buffer,
- int from,
- int digits_to_read) {
- uint64_t result = 0;
- for (int i = from; i < from + digits_to_read; ++i) {
- int digit = buffer[i] - '0';
- ASSERT(0 <= digit && digit <= 9);
- result = result * 10 + digit;
- }
- return result;
-}
-
-
-void Bignum::AssignDecimalString(Vector<const char> value) {
- // 2^64 = 18446744073709551616 > 10^19
- const int kMaxUint64DecimalDigits = 19;
- Zero();
- int length = value.length();
- int pos = 0;
- // Let's just say that each digit needs 4 bits.
- while (length >= kMaxUint64DecimalDigits) {
- uint64_t digits = ReadUInt64(value, pos, kMaxUint64DecimalDigits);
- pos += kMaxUint64DecimalDigits;
- length -= kMaxUint64DecimalDigits;
- MultiplyByPowerOfTen(kMaxUint64DecimalDigits);
- AddUInt64(digits);
- }
- uint64_t digits = ReadUInt64(value, pos, length);
- MultiplyByPowerOfTen(length);
- AddUInt64(digits);
- Clamp();
-}
-
-
-static int HexCharValue(char c) {
- if ('0' <= c && c <= '9') return c - '0';
- if ('a' <= c && c <= 'f') return 10 + c - 'a';
- if ('A' <= c && c <= 'F') return 10 + c - 'A';
- UNREACHABLE();
- return 0; // To make compiler happy.
-}
-
-
-void Bignum::AssignHexString(Vector<const char> value) {
- Zero();
- int length = value.length();
-
- int needed_bigits = length * 4 / kBigitSize + 1;
- EnsureCapacity(needed_bigits);
- int string_index = length - 1;
- for (int i = 0; i < needed_bigits - 1; ++i) {
- // These bigits are guaranteed to be "full".
- Chunk current_bigit = 0;
- for (int j = 0; j < kBigitSize / 4; j++) {
- current_bigit += HexCharValue(value[string_index--]) << (j * 4);
- }
- bigits_[i] = current_bigit;
- }
- used_digits_ = needed_bigits - 1;
-
- Chunk most_significant_bigit = 0; // Could be = 0;
- for (int j = 0; j <= string_index; ++j) {
- most_significant_bigit <<= 4;
- most_significant_bigit += HexCharValue(value[j]);
- }
- if (most_significant_bigit != 0) {
- bigits_[used_digits_] = most_significant_bigit;
- used_digits_++;
- }
- Clamp();
-}
-
-
-void Bignum::AddUInt64(uint64_t operand) {
- if (operand == 0) return;
- Bignum other;
- other.AssignUInt64(operand);
- AddBignum(other);
-}
-
-
-void Bignum::AddBignum(const Bignum& other) {
- ASSERT(IsClamped());
- ASSERT(other.IsClamped());
-
- // If this has a greater exponent than other append zero-bigits to this.
- // After this call exponent_ <= other.exponent_.
- Align(other);
-
- // There are two possibilities:
- // aaaaaaaaaaa 0000 (where the 0s represent a's exponent)
- // bbbbb 00000000
- // ----------------
- // ccccccccccc 0000
- // or
- // aaaaaaaaaa 0000
- // bbbbbbbbb 0000000
- // -----------------
- // cccccccccccc 0000
- // In both cases we might need a carry bigit.
-
- EnsureCapacity(1 + Max(BigitLength(), other.BigitLength()) - exponent_);
- Chunk carry = 0;
- int bigit_pos = other.exponent_ - exponent_;
- ASSERT(bigit_pos >= 0);
- for (int i = 0; i < other.used_digits_; ++i) {
- Chunk sum = bigits_[bigit_pos] + other.bigits_[i] + carry;
- bigits_[bigit_pos] = sum & kBigitMask;
- carry = sum >> kBigitSize;
- bigit_pos++;
- }
-
- while (carry != 0) {
- Chunk sum = bigits_[bigit_pos] + carry;
- bigits_[bigit_pos] = sum & kBigitMask;
- carry = sum >> kBigitSize;
- bigit_pos++;
- }
- used_digits_ = Max(bigit_pos, used_digits_);
- ASSERT(IsClamped());
-}
-
-
-void Bignum::SubtractBignum(const Bignum& other) {
- ASSERT(IsClamped());
- ASSERT(other.IsClamped());
- // We require this to be bigger than other.
- ASSERT(LessEqual(other, *this));
-
- Align(other);
-
- int offset = other.exponent_ - exponent_;
- Chunk borrow = 0;
- int i;
- for (i = 0; i < other.used_digits_; ++i) {
- ASSERT((borrow == 0) || (borrow == 1));
- Chunk difference = bigits_[i + offset] - other.bigits_[i] - borrow;
- bigits_[i + offset] = difference & kBigitMask;
- borrow = difference >> (kChunkSize - 1);
- }
- while (borrow != 0) {
- Chunk difference = bigits_[i + offset] - borrow;
- bigits_[i + offset] = difference & kBigitMask;
- borrow = difference >> (kChunkSize - 1);
- ++i;
- }
- Clamp();
-}
-
-
-void Bignum::ShiftLeft(int shift_amount) {
- if (used_digits_ == 0) return;
- exponent_ += shift_amount / kBigitSize;
- int local_shift = shift_amount % kBigitSize;
- EnsureCapacity(used_digits_ + 1);
- BigitsShiftLeft(local_shift);
-}
-
-
-void Bignum::MultiplyByUInt32(uint32_t factor) {
- if (factor == 1) return;
- if (factor == 0) {
- Zero();
- return;
- }
- if (used_digits_ == 0) return;
-
- // The product of a bigit with the factor is of size kBigitSize + 32.
- // Assert that this number + 1 (for the carry) fits into double chunk.
- ASSERT(kDoubleChunkSize >= kBigitSize + 32 + 1);
- DoubleChunk carry = 0;
- for (int i = 0; i < used_digits_; ++i) {
- DoubleChunk product = static_cast<DoubleChunk>(factor) * bigits_[i] + carry;
- bigits_[i] = static_cast<Chunk>(product & kBigitMask);
- carry = (product >> kBigitSize);
- }
- while (carry != 0) {
- EnsureCapacity(used_digits_ + 1);
- bigits_[used_digits_] = carry & kBigitMask;
- used_digits_++;
- carry >>= kBigitSize;
- }
-}
-
-
-void Bignum::MultiplyByUInt64(uint64_t factor) {
- if (factor == 1) return;
- if (factor == 0) {
- Zero();
- return;
- }
- ASSERT(kBigitSize < 32);
- uint64_t carry = 0;
- uint64_t low = factor & 0xFFFFFFFF;
- uint64_t high = factor >> 32;
- for (int i = 0; i < used_digits_; ++i) {
- uint64_t product_low = low * bigits_[i];
- uint64_t product_high = high * bigits_[i];
- uint64_t tmp = (carry & kBigitMask) + product_low;
- bigits_[i] = tmp & kBigitMask;
- carry = (carry >> kBigitSize) + (tmp >> kBigitSize) +
- (product_high << (32 - kBigitSize));
- }
- while (carry != 0) {
- EnsureCapacity(used_digits_ + 1);
- bigits_[used_digits_] = carry & kBigitMask;
- used_digits_++;
- carry >>= kBigitSize;
- }
-}
-
-
-void Bignum::MultiplyByPowerOfTen(int exponent) {
- const uint64_t kFive27 = UINT64_2PART_C(0x6765c793, fa10079d);
- const uint16_t kFive1 = 5;
- const uint16_t kFive2 = kFive1 * 5;
- const uint16_t kFive3 = kFive2 * 5;
- const uint16_t kFive4 = kFive3 * 5;
- const uint16_t kFive5 = kFive4 * 5;
- const uint16_t kFive6 = kFive5 * 5;
- const uint32_t kFive7 = kFive6 * 5;
- const uint32_t kFive8 = kFive7 * 5;
- const uint32_t kFive9 = kFive8 * 5;
- const uint32_t kFive10 = kFive9 * 5;
- const uint32_t kFive11 = kFive10 * 5;
- const uint32_t kFive12 = kFive11 * 5;
- const uint32_t kFive13 = kFive12 * 5;
- const uint32_t kFive1_to_12[] =
- { kFive1, kFive2, kFive3, kFive4, kFive5, kFive6,
- kFive7, kFive8, kFive9, kFive10, kFive11, kFive12 };
-
- ASSERT(exponent >= 0);
- if (exponent == 0) return;
- if (used_digits_ == 0) return;
-
- // We shift by exponent at the end just before returning.
- int remaining_exponent = exponent;
- while (remaining_exponent >= 27) {
- MultiplyByUInt64(kFive27);
- remaining_exponent -= 27;
- }
- while (remaining_exponent >= 13) {
- MultiplyByUInt32(kFive13);
- remaining_exponent -= 13;
- }
- if (remaining_exponent > 0) {
- MultiplyByUInt32(kFive1_to_12[remaining_exponent - 1]);
- }
- ShiftLeft(exponent);
-}
-
-
-void Bignum::Square() {
- ASSERT(IsClamped());
- int product_length = 2 * used_digits_;
- EnsureCapacity(product_length);
-
- // Comba multiplication: compute each column separately.
- // Example: r = a2a1a0 * b2b1b0.
- // r = 1 * a0b0 +
- // 10 * (a1b0 + a0b1) +
- // 100 * (a2b0 + a1b1 + a0b2) +
- // 1000 * (a2b1 + a1b2) +
- // 10000 * a2b2
- //
- // In the worst case we have to accumulate nb-digits products of digit*digit.
- //
- // Assert that the additional number of bits in a DoubleChunk are enough to
- // sum up used_digits of Bigit*Bigit.
- if ((1 << (2 * (kChunkSize - kBigitSize))) <= used_digits_) {
- UNIMPLEMENTED();
- }
- DoubleChunk accumulator = 0;
- // First shift the digits so we don't overwrite them.
- int copy_offset = used_digits_;
- for (int i = 0; i < used_digits_; ++i) {
- bigits_[copy_offset + i] = bigits_[i];
- }
- // We have two loops to avoid some 'if's in the loop.
- for (int i = 0; i < used_digits_; ++i) {
- // Process temporary digit i with power i.
- // The sum of the two indices must be equal to i.
- int bigit_index1 = i;
- int bigit_index2 = 0;
- // Sum all of the sub-products.
- while (bigit_index1 >= 0) {
- Chunk chunk1 = bigits_[copy_offset + bigit_index1];
- Chunk chunk2 = bigits_[copy_offset + bigit_index2];
- accumulator += static_cast<DoubleChunk>(chunk1) * chunk2;
- bigit_index1--;
- bigit_index2++;
- }
- bigits_[i] = static_cast<Chunk>(accumulator) & kBigitMask;
- accumulator >>= kBigitSize;
- }
- for (int i = used_digits_; i < product_length; ++i) {
- int bigit_index1 = used_digits_ - 1;
- int bigit_index2 = i - bigit_index1;
- // Invariant: sum of both indices is again equal to i.
- // Inner loop runs 0 times on last iteration, emptying accumulator.
- while (bigit_index2 < used_digits_) {
- Chunk chunk1 = bigits_[copy_offset + bigit_index1];
- Chunk chunk2 = bigits_[copy_offset + bigit_index2];
- accumulator += static_cast<DoubleChunk>(chunk1) * chunk2;
- bigit_index1--;
- bigit_index2++;
- }
- // The overwritten bigits_[i] will never be read in further loop iterations,
- // because bigit_index1 and bigit_index2 are always greater
- // than i - used_digits_.
- bigits_[i] = static_cast<Chunk>(accumulator) & kBigitMask;
- accumulator >>= kBigitSize;
- }
- // Since the result was guaranteed to lie inside the number the
- // accumulator must be 0 now.
- ASSERT(accumulator == 0);
-
- // Don't forget to update the used_digits and the exponent.
- used_digits_ = product_length;
- exponent_ *= 2;
- Clamp();
-}
-
-
-void Bignum::AssignPowerUInt16(uint16_t base, int power_exponent) {
- ASSERT(base != 0);
- ASSERT(power_exponent >= 0);
- if (power_exponent == 0) {
- AssignUInt16(1);
- return;
- }
- Zero();
- int shifts = 0;
- // We expect base to be in range 2-32, and most often to be 10.
- // It does not make much sense to implement different algorithms for counting
- // the bits.
- while ((base & 1) == 0) {
- base >>= 1;
- shifts++;
- }
- int bit_size = 0;
- int tmp_base = base;
- while (tmp_base != 0) {
- tmp_base >>= 1;
- bit_size++;
- }
- int final_size = bit_size * power_exponent;
- // 1 extra bigit for the shifting, and one for rounded final_size.
- EnsureCapacity(final_size / kBigitSize + 2);
-
- // Left to Right exponentiation.
- int mask = 1;
- while (power_exponent >= mask) mask <<= 1;
-
- // The mask is now pointing to the bit above the most significant 1-bit of
- // power_exponent.
- // Get rid of first 1-bit;
- mask >>= 2;
- uint64_t this_value = base;
-
- bool delayed_multipliciation = false;
- const uint64_t max_32bits = 0xFFFFFFFF;
- while (mask != 0 && this_value <= max_32bits) {
- this_value = this_value * this_value;
- // Verify that there is enough space in this_value to perform the
- // multiplication. The first bit_size bits must be 0.
- if ((power_exponent & mask) != 0) {
- uint64_t base_bits_mask =
- ~((static_cast<uint64_t>(1) << (64 - bit_size)) - 1);
- bool high_bits_zero = (this_value & base_bits_mask) == 0;
- if (high_bits_zero) {
- this_value *= base;
- } else {
- delayed_multipliciation = true;
- }
- }
- mask >>= 1;
- }
- AssignUInt64(this_value);
- if (delayed_multipliciation) {
- MultiplyByUInt32(base);
- }
-
- // Now do the same thing as a bignum.
- while (mask != 0) {
- Square();
- if ((power_exponent & mask) != 0) {
- MultiplyByUInt32(base);
- }
- mask >>= 1;
- }
-
- // And finally add the saved shifts.
- ShiftLeft(shifts * power_exponent);
-}
-
-
-// Precondition: this/other < 16bit.
-uint16_t Bignum::DivideModuloIntBignum(const Bignum& other) {
- ASSERT(IsClamped());
- ASSERT(other.IsClamped());
- ASSERT(other.used_digits_ > 0);
-
- // Easy case: if we have less digits than the divisor than the result is 0.
- // Note: this handles the case where this == 0, too.
- if (BigitLength() < other.BigitLength()) {
- return 0;
- }
-
- Align(other);
-
- uint16_t result = 0;
-
- // Start by removing multiples of 'other' until both numbers have the same
- // number of digits.
- while (BigitLength() > other.BigitLength()) {
- // This naive approach is extremely inefficient if the this divided other
- // might be big. This function is implemented for doubleToString where
- // the result should be small (less than 10).
- ASSERT(other.bigits_[other.used_digits_ - 1] >= ((1 << kBigitSize) / 16));
- // Remove the multiples of the first digit.
- // Example this = 23 and other equals 9. -> Remove 2 multiples.
- result += bigits_[used_digits_ - 1];
- SubtractTimes(other, bigits_[used_digits_ - 1]);
- }
-
- ASSERT(BigitLength() == other.BigitLength());
-
- // Both bignums are at the same length now.
- // Since other has more than 0 digits we know that the access to
- // bigits_[used_digits_ - 1] is safe.
- Chunk this_bigit = bigits_[used_digits_ - 1];
- Chunk other_bigit = other.bigits_[other.used_digits_ - 1];
-
- if (other.used_digits_ == 1) {
- // Shortcut for easy (and common) case.
- int quotient = this_bigit / other_bigit;
- bigits_[used_digits_ - 1] = this_bigit - other_bigit * quotient;
- result += quotient;
- Clamp();
- return result;
- }
-
- int division_estimate = this_bigit / (other_bigit + 1);
- result += division_estimate;
- SubtractTimes(other, division_estimate);
-
- if (other_bigit * (division_estimate + 1) > this_bigit) {
- // No need to even try to subtract. Even if other's remaining digits were 0
- // another subtraction would be too much.
- return result;
- }
-
- while (LessEqual(other, *this)) {
- SubtractBignum(other);
- result++;
- }
- return result;
-}
-
-
-template<typename S>
-static int SizeInHexChars(S number) {
- ASSERT(number > 0);
- int result = 0;
- while (number != 0) {
- number >>= 4;
- result++;
- }
- return result;
-}
-
-
-static char HexCharOfValue(int value) {
- ASSERT(0 <= value && value <= 16);
- if (value < 10) return value + '0';
- return value - 10 + 'A';
-}
-
-
-bool Bignum::ToHexString(char* buffer, int buffer_size) const {
- ASSERT(IsClamped());
- // Each bigit must be printable as separate hex-character.
- ASSERT(kBigitSize % 4 == 0);
- const int kHexCharsPerBigit = kBigitSize / 4;
-
- if (used_digits_ == 0) {
- if (buffer_size < 2) return false;
- buffer[0] = '0';
- buffer[1] = '\0';
- return true;
- }
- // We add 1 for the terminating '\0' character.
- int needed_chars = (BigitLength() - 1) * kHexCharsPerBigit +
- SizeInHexChars(bigits_[used_digits_ - 1]) + 1;
- if (needed_chars > buffer_size) return false;
- int string_index = needed_chars - 1;
- buffer[string_index--] = '\0';
- for (int i = 0; i < exponent_; ++i) {
- for (int j = 0; j < kHexCharsPerBigit; ++j) {
- buffer[string_index--] = '0';
- }
- }
- for (int i = 0; i < used_digits_ - 1; ++i) {
- Chunk current_bigit = bigits_[i];
- for (int j = 0; j < kHexCharsPerBigit; ++j) {
- buffer[string_index--] = HexCharOfValue(current_bigit & 0xF);
- current_bigit >>= 4;
- }
- }
- // And finally the last bigit.
- Chunk most_significant_bigit = bigits_[used_digits_ - 1];
- while (most_significant_bigit != 0) {
- buffer[string_index--] = HexCharOfValue(most_significant_bigit & 0xF);
- most_significant_bigit >>= 4;
- }
- return true;
-}
-
-
-Bignum::Chunk Bignum::BigitAt(int index) const {
- if (index >= BigitLength()) return 0;
- if (index < exponent_) return 0;
- return bigits_[index - exponent_];
-}
-
-
-int Bignum::Compare(const Bignum& a, const Bignum& b) {
- ASSERT(a.IsClamped());
- ASSERT(b.IsClamped());
- int bigit_length_a = a.BigitLength();
- int bigit_length_b = b.BigitLength();
- if (bigit_length_a < bigit_length_b) return -1;
- if (bigit_length_a > bigit_length_b) return +1;
- for (int i = bigit_length_a - 1; i >= Min(a.exponent_, b.exponent_); --i) {
- Chunk bigit_a = a.BigitAt(i);
- Chunk bigit_b = b.BigitAt(i);
- if (bigit_a < bigit_b) return -1;
- if (bigit_a > bigit_b) return +1;
- // Otherwise they are equal up to this digit. Try the next digit.
- }
- return 0;
-}
-
-
-int Bignum::PlusCompare(const Bignum& a, const Bignum& b, const Bignum& c) {
- ASSERT(a.IsClamped());
- ASSERT(b.IsClamped());
- ASSERT(c.IsClamped());
- if (a.BigitLength() < b.BigitLength()) {
- return PlusCompare(b, a, c);
- }
- if (a.BigitLength() + 1 < c.BigitLength()) return -1;
- if (a.BigitLength() > c.BigitLength()) return +1;
- // The exponent encodes 0-bigits. So if there are more 0-digits in 'a' than
- // 'b' has digits, then the bigit-length of 'a'+'b' must be equal to the one
- // of 'a'.
- if (a.exponent_ >= b.BigitLength() && a.BigitLength() < c.BigitLength()) {
- return -1;
- }
-
- Chunk borrow = 0;
- // Starting at min_exponent all digits are == 0. So no need to compare them.
- int min_exponent = Min(Min(a.exponent_, b.exponent_), c.exponent_);
- for (int i = c.BigitLength() - 1; i >= min_exponent; --i) {
- Chunk chunk_a = a.BigitAt(i);
- Chunk chunk_b = b.BigitAt(i);
- Chunk chunk_c = c.BigitAt(i);
- Chunk sum = chunk_a + chunk_b;
- if (sum > chunk_c + borrow) {
- return +1;
- } else {
- borrow = chunk_c + borrow - sum;
- if (borrow > 1) return -1;
- borrow <<= kBigitSize;
- }
- }
- if (borrow == 0) return 0;
- return -1;
-}
-
-
-void Bignum::Clamp() {
- while (used_digits_ > 0 && bigits_[used_digits_ - 1] == 0) {
- used_digits_--;
- }
- if (used_digits_ == 0) {
- // Zero.
- exponent_ = 0;
- }
-}
-
-
-bool Bignum::IsClamped() const {
- return used_digits_ == 0 || bigits_[used_digits_ - 1] != 0;
-}
-
-
-void Bignum::Zero() {
- for (int i = 0; i < used_digits_; ++i) {
- bigits_[i] = 0;
- }
- used_digits_ = 0;
- exponent_ = 0;
-}
-
-
-void Bignum::Align(const Bignum& other) {
- if (exponent_ > other.exponent_) {
- // If "X" represents a "hidden" digit (by the exponent) then we are in the
- // following case (a == this, b == other):
- // a: aaaaaaXXXX or a: aaaaaXXX
- // b: bbbbbbX b: bbbbbbbbXX
- // We replace some of the hidden digits (X) of a with 0 digits.
- // a: aaaaaa000X or a: aaaaa0XX
- int zero_digits = exponent_ - other.exponent_;
- EnsureCapacity(used_digits_ + zero_digits);
- for (int i = used_digits_ - 1; i >= 0; --i) {
- bigits_[i + zero_digits] = bigits_[i];
- }
- for (int i = 0; i < zero_digits; ++i) {
- bigits_[i] = 0;
- }
- used_digits_ += zero_digits;
- exponent_ -= zero_digits;
- ASSERT(used_digits_ >= 0);
- ASSERT(exponent_ >= 0);
- }
-}
-
-
-void Bignum::BigitsShiftLeft(int shift_amount) {
- ASSERT(shift_amount < kBigitSize);
- ASSERT(shift_amount >= 0);
- Chunk carry = 0;
- for (int i = 0; i < used_digits_; ++i) {
- Chunk new_carry = bigits_[i] >> (kBigitSize - shift_amount);
- bigits_[i] = ((bigits_[i] << shift_amount) + carry) & kBigitMask;
- carry = new_carry;
- }
- if (carry != 0) {
- bigits_[used_digits_] = carry;
- used_digits_++;
- }
-}
-
-
-void Bignum::SubtractTimes(const Bignum& other, int factor) {
- ASSERT(exponent_ <= other.exponent_);
- if (factor < 3) {
- for (int i = 0; i < factor; ++i) {
- SubtractBignum(other);
- }
- return;
- }
- Chunk borrow = 0;
- int exponent_diff = other.exponent_ - exponent_;
- for (int i = 0; i < other.used_digits_; ++i) {
- DoubleChunk product = static_cast<DoubleChunk>(factor) * other.bigits_[i];
- DoubleChunk remove = borrow + product;
- Chunk difference = bigits_[i + exponent_diff] - (remove & kBigitMask);
- bigits_[i + exponent_diff] = difference & kBigitMask;
- borrow = static_cast<Chunk>((difference >> (kChunkSize - 1)) +
- (remove >> kBigitSize));
- }
- for (int i = other.used_digits_ + exponent_diff; i < used_digits_; ++i) {
- if (borrow == 0) return;
- Chunk difference = bigits_[i] - borrow;
- bigits_[i] = difference & kBigitMask;
- borrow = difference >> (kChunkSize - 1);
- ++i;
- }
- Clamp();
-}
-
-
-} // namespace double_conversion