You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/04/19 21:33:50 UTC
[02/51] [partial] incubator-joshua git commit: Converted KenLM into a
submodule
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/strtod.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/strtod.cc b/ext/kenlm/util/double-conversion/strtod.cc
deleted file mode 100644
index 55b4daa..0000000
--- a/ext/kenlm/util/double-conversion/strtod.cc
+++ /dev/null
@@ -1,558 +0,0 @@
-// Copyright 2010 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#include <cstdarg>
-#include <climits>
-
-#include "strtod.h"
-#include "bignum.h"
-#include "cached-powers.h"
-#include "ieee.h"
-
-namespace double_conversion {
-
-// 2^53 = 9007199254740992.
-// Any integer with at most 15 decimal digits will hence fit into a double
-// (which has a 53bit significand) without loss of precision.
-static const int kMaxExactDoubleIntegerDecimalDigits = 15;
-// 2^64 = 18446744073709551616 > 10^19
-static const int kMaxUint64DecimalDigits = 19;
-
-// Max double: 1.7976931348623157 x 10^308
-// Min non-zero double: 4.9406564584124654 x 10^-324
-// Any x >= 10^309 is interpreted as +infinity.
-// Any x <= 10^-324 is interpreted as 0.
-// Note that 2.5e-324 (despite being smaller than the min double) will be read
-// as non-zero (equal to the min non-zero double).
-static const int kMaxDecimalPower = 309;
-static const int kMinDecimalPower = -324;
-
-// 2^64 - 1 = 18446744073709551615
-static const uint64_t kMaxUint64 = UINT64_2PART_C(0xFFFFFFFF, FFFFFFFF);
-
-
-static const double exact_powers_of_ten[] = {
- 1.0, // 10^0
- 10.0,
- 100.0,
- 1000.0,
- 10000.0,
- 100000.0,
- 1000000.0,
- 10000000.0,
- 100000000.0,
- 1000000000.0,
- 10000000000.0, // 10^10
- 100000000000.0,
- 1000000000000.0,
- 10000000000000.0,
- 100000000000000.0,
- 1000000000000000.0,
- 10000000000000000.0,
- 100000000000000000.0,
- 1000000000000000000.0,
- 10000000000000000000.0,
- 100000000000000000000.0, // 10^20
- 1000000000000000000000.0,
- // 10^22 = 0x21e19e0c9bab2400000 = 0x878678326eac9 * 2^22
- 10000000000000000000000.0
-};
-static const int kExactPowersOfTenSize = ARRAY_SIZE(exact_powers_of_ten);
-
-// Maximum number of significant digits in the decimal representation.
-// In fact the value is 772 (see conversions.cc), but to give us some margin
-// we round up to 780.
-static const int kMaxSignificantDecimalDigits = 780;
-
-static Vector<const char> TrimLeadingZeros(Vector<const char> buffer) {
- for (int i = 0; i < buffer.length(); i++) {
- if (buffer[i] != '0') {
- return buffer.SubVector(i, buffer.length());
- }
- }
- return Vector<const char>(buffer.start(), 0);
-}
-
-
-static Vector<const char> TrimTrailingZeros(Vector<const char> buffer) {
- for (int i = buffer.length() - 1; i >= 0; --i) {
- if (buffer[i] != '0') {
- return buffer.SubVector(0, i + 1);
- }
- }
- return Vector<const char>(buffer.start(), 0);
-}
-
-
-static void CutToMaxSignificantDigits(Vector<const char> buffer,
- int exponent,
- char* significant_buffer,
- int* significant_exponent) {
- for (int i = 0; i < kMaxSignificantDecimalDigits - 1; ++i) {
- significant_buffer[i] = buffer[i];
- }
- // The input buffer has been trimmed. Therefore the last digit must be
- // different from '0'.
- ASSERT(buffer[buffer.length() - 1] != '0');
- // Set the last digit to be non-zero. This is sufficient to guarantee
- // correct rounding.
- significant_buffer[kMaxSignificantDecimalDigits - 1] = '1';
- *significant_exponent =
- exponent + (buffer.length() - kMaxSignificantDecimalDigits);
-}
-
-
-// Trims the buffer and cuts it to at most kMaxSignificantDecimalDigits.
-// If possible the input-buffer is reused, but if the buffer needs to be
-// modified (due to cutting), then the input needs to be copied into the
-// buffer_copy_space.
-static void TrimAndCut(Vector<const char> buffer, int exponent,
- char* buffer_copy_space, int space_size,
- Vector<const char>* trimmed, int* updated_exponent) {
- Vector<const char> left_trimmed = TrimLeadingZeros(buffer);
- Vector<const char> right_trimmed = TrimTrailingZeros(left_trimmed);
- exponent += left_trimmed.length() - right_trimmed.length();
- if (right_trimmed.length() > kMaxSignificantDecimalDigits) {
- ASSERT(space_size >= kMaxSignificantDecimalDigits);
- CutToMaxSignificantDigits(right_trimmed, exponent,
- buffer_copy_space, updated_exponent);
- *trimmed = Vector<const char>(buffer_copy_space,
- kMaxSignificantDecimalDigits);
- } else {
- *trimmed = right_trimmed;
- *updated_exponent = exponent;
- }
-}
-
-
-// Reads digits from the buffer and converts them to a uint64.
-// Reads in as many digits as fit into a uint64.
-// When the string starts with "1844674407370955161" no further digit is read.
-// Since 2^64 = 18446744073709551616 it would still be possible to read another
-// digit if it was less than or equal to 5, but this would complicate the code.
-static uint64_t ReadUint64(Vector<const char> buffer,
- int* number_of_read_digits) {
- uint64_t result = 0;
- int i = 0;
- while (i < buffer.length() && result <= (kMaxUint64 / 10 - 1)) {
- int digit = buffer[i++] - '0';
- ASSERT(0 <= digit && digit <= 9);
- result = 10 * result + digit;
- }
- *number_of_read_digits = i;
- return result;
-}
-
-
-// Reads a DiyFp from the buffer.
-// The returned DiyFp is not necessarily normalized.
-// If remaining_decimals is zero then the returned DiyFp is accurate.
-// Otherwise it has been rounded and has error of at most 1/2 ulp.
-static void ReadDiyFp(Vector<const char> buffer,
- DiyFp* result,
- int* remaining_decimals) {
- int read_digits;
- uint64_t significand = ReadUint64(buffer, &read_digits);
- if (buffer.length() == read_digits) {
- *result = DiyFp(significand, 0);
- *remaining_decimals = 0;
- } else {
- // Round the significand.
- if (buffer[read_digits] >= '5') {
- significand++;
- }
- // Compute the binary exponent.
- int exponent = 0;
- *result = DiyFp(significand, exponent);
- *remaining_decimals = buffer.length() - read_digits;
- }
-}
-
-
-static bool DoubleStrtod(Vector<const char> trimmed,
- int exponent,
- double* result) {
-#if !defined(DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS)
- // On x86 the floating-point stack can be 64 or 80 bits wide. If it is
- // 80 bits wide (as is the case on Linux) then double-rounding occurs and the
- // result is not accurate.
- // We know that Windows32 uses 64 bits and is therefore accurate.
- // Note that the ARM simulator is compiled for 32bits. It therefore exhibits
- // the same problem.
- return false;
-#endif
- if (trimmed.length() <= kMaxExactDoubleIntegerDecimalDigits) {
- int read_digits;
- // The trimmed input fits into a double.
- // If the 10^exponent (resp. 10^-exponent) fits into a double too then we
- // can compute the result-double simply by multiplying (resp. dividing) the
- // two numbers.
- // This is possible because IEEE guarantees that floating-point operations
- // return the best possible approximation.
- if (exponent < 0 && -exponent < kExactPowersOfTenSize) {
- // 10^-exponent fits into a double.
- *result = static_cast<double>(ReadUint64(trimmed, &read_digits));
- ASSERT(read_digits == trimmed.length());
- *result /= exact_powers_of_ten[-exponent];
- return true;
- }
- if (0 <= exponent && exponent < kExactPowersOfTenSize) {
- // 10^exponent fits into a double.
- *result = static_cast<double>(ReadUint64(trimmed, &read_digits));
- ASSERT(read_digits == trimmed.length());
- *result *= exact_powers_of_ten[exponent];
- return true;
- }
- int remaining_digits =
- kMaxExactDoubleIntegerDecimalDigits - trimmed.length();
- if ((0 <= exponent) &&
- (exponent - remaining_digits < kExactPowersOfTenSize)) {
- // The trimmed string was short and we can multiply it with
- // 10^remaining_digits. As a result the remaining exponent now fits
- // into a double too.
- *result = static_cast<double>(ReadUint64(trimmed, &read_digits));
- ASSERT(read_digits == trimmed.length());
- *result *= exact_powers_of_ten[remaining_digits];
- *result *= exact_powers_of_ten[exponent - remaining_digits];
- return true;
- }
- }
- return false;
-}
-
-
-// Returns 10^exponent as an exact DiyFp.
-// The given exponent must be in the range [1; kDecimalExponentDistance[.
-static DiyFp AdjustmentPowerOfTen(int exponent) {
- ASSERT(0 < exponent);
- ASSERT(exponent < PowersOfTenCache::kDecimalExponentDistance);
- // Simply hardcode the remaining powers for the given decimal exponent
- // distance.
- ASSERT(PowersOfTenCache::kDecimalExponentDistance == 8);
- switch (exponent) {
- case 1: return DiyFp(UINT64_2PART_C(0xa0000000, 00000000), -60);
- case 2: return DiyFp(UINT64_2PART_C(0xc8000000, 00000000), -57);
- case 3: return DiyFp(UINT64_2PART_C(0xfa000000, 00000000), -54);
- case 4: return DiyFp(UINT64_2PART_C(0x9c400000, 00000000), -50);
- case 5: return DiyFp(UINT64_2PART_C(0xc3500000, 00000000), -47);
- case 6: return DiyFp(UINT64_2PART_C(0xf4240000, 00000000), -44);
- case 7: return DiyFp(UINT64_2PART_C(0x98968000, 00000000), -40);
- default:
- UNREACHABLE();
- return DiyFp(0, 0);
- }
-}
-
-
-// If the function returns true then the result is the correct double.
-// Otherwise it is either the correct double or the double that is just below
-// the correct double.
-static bool DiyFpStrtod(Vector<const char> buffer,
- int exponent,
- double* result) {
- DiyFp input;
- int remaining_decimals;
- ReadDiyFp(buffer, &input, &remaining_decimals);
- // Since we may have dropped some digits the input is not accurate.
- // If remaining_decimals is different from 0 then the error is at most
- // .5 ulp (unit in the last place).
- // We don't want to deal with fractions and therefore keep a common
- // denominator.
- const int kDenominatorLog = 3;
- const int kDenominator = 1 << kDenominatorLog;
- // Move the remaining decimals into the exponent.
- exponent += remaining_decimals;
- int error = (remaining_decimals == 0 ? 0 : kDenominator / 2);
-
- int old_e = input.e();
- input.Normalize();
- error <<= old_e - input.e();
-
- ASSERT(exponent <= PowersOfTenCache::kMaxDecimalExponent);
- if (exponent < PowersOfTenCache::kMinDecimalExponent) {
- *result = 0.0;
- return true;
- }
- DiyFp cached_power;
- int cached_decimal_exponent;
- PowersOfTenCache::GetCachedPowerForDecimalExponent(exponent,
- &cached_power,
- &cached_decimal_exponent);
-
- if (cached_decimal_exponent != exponent) {
- int adjustment_exponent = exponent - cached_decimal_exponent;
- DiyFp adjustment_power = AdjustmentPowerOfTen(adjustment_exponent);
- input.Multiply(adjustment_power);
- if (kMaxUint64DecimalDigits - buffer.length() >= adjustment_exponent) {
- // The product of input with the adjustment power fits into a 64 bit
- // integer.
- ASSERT(DiyFp::kSignificandSize == 64);
- } else {
- // The adjustment power is exact. There is hence only an error of 0.5.
- error += kDenominator / 2;
- }
- }
-
- input.Multiply(cached_power);
- // The error introduced by a multiplication of a*b equals
- // error_a + error_b + error_a*error_b/2^64 + 0.5
- // Substituting a with 'input' and b with 'cached_power' we have
- // error_b = 0.5 (all cached powers have an error of less than 0.5 ulp),
- // error_ab = 0 or 1 / kDenominator > error_a*error_b/ 2^64
- int error_b = kDenominator / 2;
- int error_ab = (error == 0 ? 0 : 1); // We round up to 1.
- int fixed_error = kDenominator / 2;
- error += error_b + error_ab + fixed_error;
-
- old_e = input.e();
- input.Normalize();
- error <<= old_e - input.e();
-
- // See if the double's significand changes if we add/subtract the error.
- int order_of_magnitude = DiyFp::kSignificandSize + input.e();
- int effective_significand_size =
- Double::SignificandSizeForOrderOfMagnitude(order_of_magnitude);
- int precision_digits_count =
- DiyFp::kSignificandSize - effective_significand_size;
- if (precision_digits_count + kDenominatorLog >= DiyFp::kSignificandSize) {
- // This can only happen for very small denormals. In this case the
- // half-way multiplied by the denominator exceeds the range of an uint64.
- // Simply shift everything to the right.
- int shift_amount = (precision_digits_count + kDenominatorLog) -
- DiyFp::kSignificandSize + 1;
- input.set_f(input.f() >> shift_amount);
- input.set_e(input.e() + shift_amount);
- // We add 1 for the lost precision of error, and kDenominator for
- // the lost precision of input.f().
- error = (error >> shift_amount) + 1 + kDenominator;
- precision_digits_count -= shift_amount;
- }
- // We use uint64_ts now. This only works if the DiyFp uses uint64_ts too.
- ASSERT(DiyFp::kSignificandSize == 64);
- ASSERT(precision_digits_count < 64);
- uint64_t one64 = 1;
- uint64_t precision_bits_mask = (one64 << precision_digits_count) - 1;
- uint64_t precision_bits = input.f() & precision_bits_mask;
- uint64_t half_way = one64 << (precision_digits_count - 1);
- precision_bits *= kDenominator;
- half_way *= kDenominator;
- DiyFp rounded_input(input.f() >> precision_digits_count,
- input.e() + precision_digits_count);
- if (precision_bits >= half_way + error) {
- rounded_input.set_f(rounded_input.f() + 1);
- }
- // If the last_bits are too close to the half-way case then we are too
- // inaccurate and round down. In this case we return false so that we can
- // fall back to a more precise algorithm.
-
- *result = Double(rounded_input).value();
- if (half_way - error < precision_bits && precision_bits < half_way + error) {
- // Too imprecise. The caller will have to fall back to a slower version.
- // However the returned number is guaranteed to be either the correct
- // double, or the next-lower double.
- return false;
- } else {
- return true;
- }
-}
-
-
-// Returns
-// - -1 if buffer*10^exponent < diy_fp.
-// - 0 if buffer*10^exponent == diy_fp.
-// - +1 if buffer*10^exponent > diy_fp.
-// Preconditions:
-// buffer.length() + exponent <= kMaxDecimalPower + 1
-// buffer.length() + exponent > kMinDecimalPower
-// buffer.length() <= kMaxSignificantDecimalDigits
-static int CompareBufferWithDiyFp(Vector<const char> buffer,
- int exponent,
- DiyFp diy_fp) {
- ASSERT(buffer.length() + exponent <= kMaxDecimalPower + 1);
- ASSERT(buffer.length() + exponent > kMinDecimalPower);
- ASSERT(buffer.length() <= kMaxSignificantDecimalDigits);
- // Make sure that the Bignum will be able to hold all our numbers.
- // Our Bignum implementation has a separate field for exponents. Shifts will
- // consume at most one bigit (< 64 bits).
- // log2(10) == 3.3219...
- ASSERT(((kMaxDecimalPower + 1) * 333 / 100) < Bignum::kMaxSignificantBits);
- Bignum buffer_bignum;
- Bignum diy_fp_bignum;
- buffer_bignum.AssignDecimalString(buffer);
- diy_fp_bignum.AssignUInt64(diy_fp.f());
- if (exponent >= 0) {
- buffer_bignum.MultiplyByPowerOfTen(exponent);
- } else {
- diy_fp_bignum.MultiplyByPowerOfTen(-exponent);
- }
- if (diy_fp.e() > 0) {
- diy_fp_bignum.ShiftLeft(diy_fp.e());
- } else {
- buffer_bignum.ShiftLeft(-diy_fp.e());
- }
- return Bignum::Compare(buffer_bignum, diy_fp_bignum);
-}
-
-
-// Returns true if the guess is the correct double.
-// Returns false, when guess is either correct or the next-lower double.
-static bool ComputeGuess(Vector<const char> trimmed, int exponent,
- double* guess) {
- if (trimmed.length() == 0) {
- *guess = 0.0;
- return true;
- }
- if (exponent + trimmed.length() - 1 >= kMaxDecimalPower) {
- *guess = Double::Infinity();
- return true;
- }
- if (exponent + trimmed.length() <= kMinDecimalPower) {
- *guess = 0.0;
- return true;
- }
-
- if (DoubleStrtod(trimmed, exponent, guess) ||
- DiyFpStrtod(trimmed, exponent, guess)) {
- return true;
- }
- if (*guess == Double::Infinity()) {
- return true;
- }
- return false;
-}
-
-double Strtod(Vector<const char> buffer, int exponent) {
- char copy_buffer[kMaxSignificantDecimalDigits];
- Vector<const char> trimmed;
- int updated_exponent;
- TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits,
- &trimmed, &updated_exponent);
- exponent = updated_exponent;
-
- double guess;
- bool is_correct = ComputeGuess(trimmed, exponent, &guess);
- if (is_correct) return guess;
-
- DiyFp upper_boundary = Double(guess).UpperBoundary();
- int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary);
- if (comparison < 0) {
- return guess;
- } else if (comparison > 0) {
- return Double(guess).NextDouble();
- } else if ((Double(guess).Significand() & 1) == 0) {
- // Round towards even.
- return guess;
- } else {
- return Double(guess).NextDouble();
- }
-}
-
-float Strtof(Vector<const char> buffer, int exponent) {
- char copy_buffer[kMaxSignificantDecimalDigits];
- Vector<const char> trimmed;
- int updated_exponent;
- TrimAndCut(buffer, exponent, copy_buffer, kMaxSignificantDecimalDigits,
- &trimmed, &updated_exponent);
- exponent = updated_exponent;
-
- double double_guess;
- bool is_correct = ComputeGuess(trimmed, exponent, &double_guess);
-
- float float_guess = static_cast<float>(double_guess);
- if (float_guess == double_guess) {
- // This shortcut triggers for integer values.
- return float_guess;
- }
-
- // We must catch double-rounding. Say the double has been rounded up, and is
- // now a boundary of a float, and rounds up again. This is why we have to
- // look at previous too.
- // Example (in decimal numbers):
- // input: 12349
- // high-precision (4 digits): 1235
- // low-precision (3 digits):
- // when read from input: 123
- // when rounded from high precision: 124.
- // To do this we simply look at the neighbors of the correct result and see
- // if they would round to the same float. If the guess is not correct we have
- // to look at four values (since two different doubles could be the correct
- // double).
-
- double double_next = Double(double_guess).NextDouble();
- double double_previous = Double(double_guess).PreviousDouble();
-
- float f1 = static_cast<float>(double_previous);
-#ifndef NDEBUG
- float f2 = float_guess;
-#endif
- float f3 = static_cast<float>(double_next);
- float f4;
- if (is_correct) {
- f4 = f3;
- } else {
- double double_next2 = Double(double_next).NextDouble();
- f4 = static_cast<float>(double_next2);
- }
-#ifndef NDEBUG
- ASSERT(f1 <= f2 && f2 <= f3 && f3 <= f4);
-#endif
-
- // If the guess doesn't lie near a single-precision boundary we can simply
- // return its float-value.
- if (f1 == f4) {
- return float_guess;
- }
-
- ASSERT((f1 != f2 && f2 == f3 && f3 == f4) ||
- (f1 == f2 && f2 != f3 && f3 == f4) ||
- (f1 == f2 && f2 == f3 && f3 != f4));
-
- // guess and next are the two possible candidates (in the same way that
- // double_guess was the lower candidate for a double-precision guess).
- float guess = f1;
- float next = f4;
- DiyFp upper_boundary;
- if (guess == 0.0f) {
- float min_float = 1e-45f;
- upper_boundary = Double(static_cast<double>(min_float) / 2).AsDiyFp();
- } else {
- upper_boundary = Single(guess).UpperBoundary();
- }
- int comparison = CompareBufferWithDiyFp(trimmed, exponent, upper_boundary);
- if (comparison < 0) {
- return guess;
- } else if (comparison > 0) {
- return next;
- } else if ((Single(guess).Significand() & 1) == 0) {
- // Round towards even.
- return guess;
- } else {
- return next;
- }
-}
-
-} // namespace double_conversion
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/strtod.h
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/strtod.h b/ext/kenlm/util/double-conversion/strtod.h
deleted file mode 100644
index ed0293b..0000000
--- a/ext/kenlm/util/double-conversion/strtod.h
+++ /dev/null
@@ -1,45 +0,0 @@
-// Copyright 2010 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef DOUBLE_CONVERSION_STRTOD_H_
-#define DOUBLE_CONVERSION_STRTOD_H_
-
-#include "utils.h"
-
-namespace double_conversion {
-
-// The buffer must only contain digits in the range [0-9]. It must not
-// contain a dot or a sign. It must not start with '0', and must not be empty.
-double Strtod(Vector<const char> buffer, int exponent);
-
-// The buffer must only contain digits in the range [0-9]. It must not
-// contain a dot or a sign. It must not start with '0', and must not be empty.
-float Strtof(Vector<const char> buffer, int exponent);
-
-} // namespace double_conversion
-
-#endif // DOUBLE_CONVERSION_STRTOD_H_
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/double-conversion/utils.h
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/double-conversion/utils.h b/ext/kenlm/util/double-conversion/utils.h
deleted file mode 100644
index 9ccb3b6..0000000
--- a/ext/kenlm/util/double-conversion/utils.h
+++ /dev/null
@@ -1,320 +0,0 @@
-// Copyright 2010 the V8 project authors. All rights reserved.
-// Redistribution and use in source and binary forms, with or without
-// modification, are permitted provided that the following conditions are
-// met:
-//
-// * Redistributions of source code must retain the above copyright
-// notice, this list of conditions and the following disclaimer.
-// * Redistributions in binary form must reproduce the above
-// copyright notice, this list of conditions and the following
-// disclaimer in the documentation and/or other materials provided
-// with the distribution.
-// * Neither the name of Google Inc. nor the names of its
-// contributors may be used to endorse or promote products derived
-// from this software without specific prior written permission.
-//
-// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-#ifndef DOUBLE_CONVERSION_UTILS_H_
-#define DOUBLE_CONVERSION_UTILS_H_
-
-#include <stdlib.h>
-#include <string.h>
-
-#include <assert.h>
-#ifndef ASSERT
-#define ASSERT(condition) (assert(condition))
-#endif
-#ifndef UNIMPLEMENTED
-#define UNIMPLEMENTED() (abort())
-#endif
-#ifndef UNREACHABLE
-#define UNREACHABLE() (abort())
-#endif
-
-// Double operations detection based on target architecture.
-// Linux uses an 80bit wide floating point stack on x86. This induces double
-// rounding, which in turn leads to wrong results.
-// An easy way to test if the floating-point operations are correct is to
-// evaluate: 89255.0/1e22. If the floating-point stack is 64 bits wide then
-// the result is equal to 89255e-22.
-// The best way to test this, is to create a division-function and to compare
-// the output of the division with the expected result. (Inlining must be
-// disabled.)
-// On Linux,x86 89255e-22 != Div_double(89255.0/1e22)
-#if defined(_M_X64) || defined(__x86_64__) || \
- defined(__ARMEL__) || defined(__avr32__) || \
- defined(__hppa__) || defined(__ia64__) || \
- defined(__mips__) || defined(__powerpc__) || \
- defined(__sparc__) || defined(__sparc) || defined(__s390__) || \
- defined(__SH4__) || defined(__alpha__) || \
- defined(_MIPS_ARCH_MIPS32R2)
-#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1
-#elif defined(_M_IX86) || defined(__i386__) || defined(__i386)
-#if defined(_WIN32)
-// Windows uses a 64bit wide floating point stack.
-#define DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS 1
-#else
-#undef DOUBLE_CONVERSION_CORRECT_DOUBLE_OPERATIONS
-#endif // _WIN32
-#else
-#error Target architecture was not detected as supported by Double-Conversion.
-#endif
-
-
-#if defined(_WIN32) && !defined(__MINGW32__)
-
-typedef signed char int8_t;
-typedef unsigned char uint8_t;
-typedef short int16_t; // NOLINT
-typedef unsigned short uint16_t; // NOLINT
-typedef int int32_t;
-typedef unsigned int uint32_t;
-typedef __int64 int64_t;
-typedef unsigned __int64 uint64_t;
-// intptr_t and friends are defined in crtdefs.h through stdio.h.
-
-#else
-
-#include <stdint.h>
-
-#endif
-
-// The following macro works on both 32 and 64-bit platforms.
-// Usage: instead of writing 0x1234567890123456
-// write UINT64_2PART_C(0x12345678,90123456);
-#define UINT64_2PART_C(a, b) (((static_cast<uint64_t>(a) << 32) + 0x##b##u))
-
-
-// The expression ARRAY_SIZE(a) is a compile-time constant of type
-// size_t which represents the number of elements of the given
-// array. You should only use ARRAY_SIZE on statically allocated
-// arrays.
-#ifndef ARRAY_SIZE
-#define ARRAY_SIZE(a) \
- ((sizeof(a) / sizeof(*(a))) / \
- static_cast<size_t>(!(sizeof(a) % sizeof(*(a)))))
-#endif
-
-// A macro to disallow the evil copy constructor and operator= functions
-// This should be used in the private: declarations for a class
-#ifndef DISALLOW_COPY_AND_ASSIGN
-#define DISALLOW_COPY_AND_ASSIGN(TypeName) \
- TypeName(const TypeName&); \
- void operator=(const TypeName&)
-#endif
-
-// A macro to disallow all the implicit constructors, namely the
-// default constructor, copy constructor and operator= functions.
-//
-// This should be used in the private: declarations for a class
-// that wants to prevent anyone from instantiating it. This is
-// especially useful for classes containing only static methods.
-#ifndef DISALLOW_IMPLICIT_CONSTRUCTORS
-#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName) \
- TypeName(); \
- DISALLOW_COPY_AND_ASSIGN(TypeName)
-#endif
-
-namespace double_conversion {
-
-static const int kCharSize = sizeof(char);
-
-// Returns the maximum of the two parameters.
-template <typename T>
-static T Max(T a, T b) {
- return a < b ? b : a;
-}
-
-
-// Returns the minimum of the two parameters.
-template <typename T>
-static T Min(T a, T b) {
- return a < b ? a : b;
-}
-
-
-inline int StrLength(const char* string) {
- size_t length = strlen(string);
- ASSERT(length == static_cast<size_t>(static_cast<int>(length)));
- return static_cast<int>(length);
-}
-
-// This is a simplified version of V8's Vector class.
-template <typename T>
-class Vector {
- public:
- Vector() : start_(NULL), length_(0) {}
- Vector(T* data, int length) : start_(data), length_(length) {
- ASSERT(length == 0 || (length > 0 && data != NULL));
- }
-
- // Returns a vector using the same backing storage as this one,
- // spanning from and including 'from', to but not including 'to'.
- Vector<T> SubVector(int from, int to) {
- ASSERT(to <= length_);
- ASSERT(from < to);
- ASSERT(0 <= from);
- return Vector<T>(start() + from, to - from);
- }
-
- // Returns the length of the vector.
- int length() const { return length_; }
-
- // Returns whether or not the vector is empty.
- bool is_empty() const { return length_ == 0; }
-
- // Returns the pointer to the start of the data in the vector.
- T* start() const { return start_; }
-
- // Access individual vector elements - checks bounds in debug mode.
- T& operator[](int index) const {
- ASSERT(0 <= index && index < length_);
- return start_[index];
- }
-
- T& first() { return start_[0]; }
-
- T& last() { return start_[length_ - 1]; }
-
- private:
- T* start_;
- int length_;
-};
-
-
-// Helper class for building result strings in a character buffer. The
-// purpose of the class is to use safe operations that check the
-// buffer bounds on all operations in debug mode.
-class StringBuilder {
- public:
- StringBuilder(char* buffer, int size)
- : buffer_(buffer, size), position_(0) { }
-
- ~StringBuilder() { if (!is_finalized()) Finalize(); }
-
- int size() const { return buffer_.length(); }
-
- // Get the current position in the builder.
- int position() const {
- ASSERT(!is_finalized());
- return position_;
- }
-
- // Reset the position.
- void Reset() { position_ = 0; }
-
- // Add a single character to the builder. It is not allowed to add
- // 0-characters; use the Finalize() method to terminate the string
- // instead.
- void AddCharacter(char c) {
- // I just extract raw data not a cstr so null is fine.
- //ASSERT(c != '\0');
- ASSERT(!is_finalized() && position_ < buffer_.length());
- buffer_[position_++] = c;
- }
-
- // Add an entire string to the builder. Uses strlen() internally to
- // compute the length of the input string.
- void AddString(const char* s) {
- AddSubstring(s, StrLength(s));
- }
-
- // Add the first 'n' characters of the given string 's' to the
- // builder. The input string must have enough characters.
- void AddSubstring(const char* s, int n) {
- ASSERT(!is_finalized() && position_ + n < buffer_.length());
- // I just extract raw data not a cstr so null is fine.
- //ASSERT(static_cast<size_t>(n) <= strlen(s));
- memmove(&buffer_[position_], s, n * kCharSize);
- position_ += n;
- }
-
-
- // Add character padding to the builder. If count is non-positive,
- // nothing is added to the builder.
- void AddPadding(char c, int count) {
- for (int i = 0; i < count; i++) {
- AddCharacter(c);
- }
- }
-
- // Finalize the string by 0-terminating it and returning the buffer.
- char* Finalize() {
- ASSERT(!is_finalized() && position_ < buffer_.length());
- buffer_[position_] = '\0';
- // Make sure nobody managed to add a 0-character to the
- // buffer while building the string.
- // I just extract raw data not a cstr so null is fine.
- //ASSERT(strlen(buffer_.start()) == static_cast<size_t>(position_));
- position_ = -1;
- ASSERT(is_finalized());
- return buffer_.start();
- }
-
- private:
- Vector<char> buffer_;
- int position_;
-
- bool is_finalized() const { return position_ < 0; }
-
- DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder);
-};
-
-// The type-based aliasing rule allows the compiler to assume that pointers of
-// different types (for some definition of different) never alias each other.
-// Thus the following code does not work:
-//
-// float f = foo();
-// int fbits = *(int*)(&f);
-//
-// The compiler 'knows' that the int pointer can't refer to f since the types
-// don't match, so the compiler may cache f in a register, leaving random data
-// in fbits. Using C++ style casts makes no difference, however a pointer to
-// char data is assumed to alias any other pointer. This is the 'memcpy
-// exception'.
-//
-// BitCast uses the memcpy exception to move the bits from a variable of one
-// type to a variable of another type. Of course the end result is likely to
-// be implementation dependent. Most compilers (gcc-4.2 and MSVC 2005)
-// will completely optimize BitCast away.
-//
-// There is an additional use for BitCast.
-// Recent gccs will warn when they see casts that may result in breakage due to
-// the type-based aliasing rule. If you have checked that there is no breakage
-// you can use BitCast to cast one pointer type to another. This confuses gcc
-// enough that it can no longer see that you have cast one pointer type to
-// another thus avoiding the warning.
-template <class Dest, class Source>
-inline Dest BitCast(const Source& source) {
- // Compile time assertion: sizeof(Dest) == sizeof(Source)
- // A compile error here means your Dest and Source have different sizes.
- typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1]
-#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
- __attribute__((unused))
-#endif
- ;
-
- Dest dest;
- memmove(&dest, &source, sizeof(dest));
- return dest;
-}
-
-template <class Dest, class Source>
-inline Dest BitCast(Source* source) {
- return BitCast<Dest>(reinterpret_cast<uintptr_t>(source));
-}
-
-} // namespace double_conversion
-
-#endif // DOUBLE_CONVERSION_UTILS_H_
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/ersatz_progress.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/ersatz_progress.cc b/ext/kenlm/util/ersatz_progress.cc
deleted file mode 100644
index 55c82e7..0000000
--- a/ext/kenlm/util/ersatz_progress.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-#include "util/ersatz_progress.hh"
-
-#include <algorithm>
-#include <ostream>
-#include <limits>
-#include <string>
-
-namespace util {
-
-namespace { const unsigned char kWidth = 100; }
-
-const char kProgressBanner[] = "----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100\n";
-
-ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits<uint64_t>::max()), complete_(next_), out_(NULL) {}
-
-ErsatzProgress::~ErsatzProgress() {
- if (out_) Finished();
-}
-
-ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message)
- : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) {
- if (!out_) {
- next_ = std::numeric_limits<uint64_t>::max();
- return;
- }
- if (!message.empty()) *out_ << message << '\n';
- *out_ << kProgressBanner;
-}
-
-void ErsatzProgress::Milestone() {
- if (!out_) { current_ = 0; return; }
- if (!complete_) return;
- unsigned char stone = std::min(static_cast<uint64_t>(kWidth), (current_ * kWidth) / complete_);
-
- for (; stones_written_ < stone; ++stones_written_) {
- (*out_) << '*';
- }
- if (stone == kWidth) {
- (*out_) << std::endl;
- next_ = std::numeric_limits<uint64_t>::max();
- out_ = NULL;
- } else {
- next_ = std::max(next_, ((stone + 1) * complete_ + kWidth - 1) / kWidth);
- }
-}
-
-} // namespace util
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/ersatz_progress.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/ersatz_progress.hh b/ext/kenlm/util/ersatz_progress.hh
deleted file mode 100644
index b47aded..0000000
--- a/ext/kenlm/util/ersatz_progress.hh
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef UTIL_ERSATZ_PROGRESS_H
-#define UTIL_ERSATZ_PROGRESS_H
-
-#include <iostream>
-#include <string>
-#include <stdint.h>
-
-// Ersatz version of boost::progress so core language model doesn't depend on
-// boost. Also adds option to print nothing.
-
-namespace util {
-
-extern const char kProgressBanner[];
-
-class ErsatzProgress {
- public:
- // No output.
- ErsatzProgress();
-
- // Null means no output. The null value is useful for passing along the ostream pointer from another caller.
- explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = "");
-
- ~ErsatzProgress();
-
- ErsatzProgress &operator++() {
- if (++current_ >= next_) Milestone();
- return *this;
- }
-
- ErsatzProgress &operator+=(uint64_t amount) {
- if ((current_ += amount) >= next_) Milestone();
- return *this;
- }
-
- void Set(uint64_t to) {
- if ((current_ = to) >= next_) Milestone();
- }
-
- void Finished() {
- Set(complete_);
- }
-
- private:
- void Milestone();
-
- uint64_t current_, next_, complete_;
- unsigned char stones_written_;
- std::ostream *out_;
-
- // noncopyable
- ErsatzProgress(const ErsatzProgress &other);
- ErsatzProgress &operator=(const ErsatzProgress &other);
-};
-
-} // namespace util
-
-#endif // UTIL_ERSATZ_PROGRESS_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/exception.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/exception.cc b/ext/kenlm/util/exception.cc
deleted file mode 100644
index e644d2c..0000000
--- a/ext/kenlm/util/exception.cc
+++ /dev/null
@@ -1,105 +0,0 @@
-#include "util/exception.hh"
-
-#ifdef __GXX_RTTI
-#include <typeinfo>
-#endif
-
-#include <cerrno>
-#include <cstring>
-
-#if defined(_WIN32) || defined(_WIN64)
-#include <windows.h>
-#include <io.h>
-#endif
-
-namespace util {
-
-Exception::Exception() throw() {}
-Exception::~Exception() throw() {}
-
-void Exception::SetLocation(const char *file, unsigned int line, const char *func, const char *child_name, const char *condition) {
- /* The child class might have set some text, but we want this to come first.
- * Another option would be passing this information to the constructor, but
- * then child classes would have to accept constructor arguments and pass
- * them down.
- */
- std::string old_text;
- std::swap(old_text, what_);
- StringStream stream(what_);
- stream << file << ':' << line;
- if (func) stream << " in " << func << " threw ";
- if (child_name) {
- stream << child_name;
- } else {
-#ifdef __GXX_RTTI
- stream << typeid(this).name();
-#else
- stream << "an exception";
-#endif
- }
- if (condition) {
- stream << " because `" << condition << '\'';
- }
- stream << ".\n";
- stream << old_text;
-}
-
-namespace {
-
-#ifdef __GNUC__
-const char *HandleStrerror(int ret, const char *buf) __attribute__ ((unused));
-const char *HandleStrerror(const char *ret, const char * /*buf*/) __attribute__ ((unused));
-#endif
-// At least one of these functions will not be called.
-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wunused-function"
-#endif
-// The XOPEN version.
-const char *HandleStrerror(int ret, const char *buf) {
- if (!ret) return buf;
- return NULL;
-}
-
-// The GNU version.
-const char *HandleStrerror(const char *ret, const char * /*buf*/) {
- return ret;
-}
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
-} // namespace
-
-ErrnoException::ErrnoException() throw() : errno_(errno) {
- char buf[200];
- buf[0] = 0;
-#if defined(sun) || defined(_WIN32) || defined(_WIN64)
- const char *add = strerror(errno);
-#else
- const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf);
-#endif
-
- if (add) {
- *this << add << ' ';
- }
-}
-
-ErrnoException::~ErrnoException() throw() {}
-
-OverflowException::OverflowException() throw() {}
-OverflowException::~OverflowException() throw() {}
-
-#if defined(_WIN32) || defined(_WIN64)
-WindowsException::WindowsException() throw() {
- unsigned int last_error = GetLastError();
- char error_msg[256] = "";
- if (!FormatMessageA(FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, last_error, LANG_NEUTRAL, error_msg, sizeof(error_msg), NULL)) {
- *this << "Windows error " << GetLastError() << " while formatting Windows error " << last_error << ". ";
- } else {
- *this << "Windows error " << last_error << ": " << error_msg;
- }
-}
-WindowsException::~WindowsException() throw() {}
-#endif
-
-} // namespace util
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/exception.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/exception.hh b/ext/kenlm/util/exception.hh
deleted file mode 100644
index 57d803d..0000000
--- a/ext/kenlm/util/exception.hh
+++ /dev/null
@@ -1,159 +0,0 @@
-#ifndef UTIL_EXCEPTION_H
-#define UTIL_EXCEPTION_H
-
-#include "util/string_stream.hh"
-
-#include <exception>
-#include <limits>
-#include <string>
-#include <stdint.h>
-
-namespace util {
-
-template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
-
-class Exception : public std::exception {
- public:
- Exception() throw();
- virtual ~Exception() throw();
-
- const char *what() const throw() { return what_.c_str(); }
-
- // For use by the UTIL_THROW macros.
- void SetLocation(
- const char *file,
- unsigned int line,
- const char *func,
- const char *child_name,
- const char *condition);
-
- private:
- template <class Except, class Data> friend typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data);
-
- // This helps restrict operator<< defined below.
- template <class T> struct ExceptionTag {
- typedef T Identity;
- };
-
- std::string what_;
-};
-
-/* This implements the normal operator<< for Exception and all its children.
- * SFINAE means it only applies to Exception. Think of this as an ersatz
- * boost::enable_if.
- */
-template <class Except, class Data> typename Except::template ExceptionTag<Except&>::Identity operator<<(Except &e, const Data &data) {
- StringStream(e.what_) << data;
- return e;
-}
-
-#ifdef __GNUC__
-#define UTIL_FUNC_NAME __PRETTY_FUNCTION__
-#else
-#ifdef _WIN32
-#define UTIL_FUNC_NAME __FUNCTION__
-#else
-#define UTIL_FUNC_NAME NULL
-#endif
-#endif
-
-/* Create an instance of Exception, add the message Modify, and throw it.
- * Modify is appended to the what() message and can contain << for ostream
- * operations.
- *
- * do .. while kludge to swallow trailing ; character
- * http://gcc.gnu.org/onlinedocs/cpp/Swallowing-the-Semicolon.html .
- * Arg can be a constructor argument to the exception.
- */
-#define UTIL_THROW_BACKEND(Condition, Exception, Arg, Modify) do { \
- Exception UTIL_e Arg; \
- UTIL_e.SetLocation(__FILE__, __LINE__, UTIL_FUNC_NAME, #Exception, Condition); \
- UTIL_e << Modify; \
- throw UTIL_e; \
-} while (0)
-
-#define UTIL_THROW_ARG(Exception, Arg, Modify) \
- UTIL_THROW_BACKEND(NULL, Exception, Arg, Modify)
-
-#define UTIL_THROW(Exception, Modify) \
- UTIL_THROW_BACKEND(NULL, Exception, , Modify);
-
-#define UTIL_THROW2(Modify) \
- UTIL_THROW_BACKEND(NULL, util::Exception, , Modify);
-
-#if __GNUC__ >= 3
-#define UTIL_UNLIKELY(x) __builtin_expect (!!(x), 0)
-#else
-#define UTIL_UNLIKELY(x) (x)
-#endif
-
-#if __GNUC__ >= 3
-#define UTIL_LIKELY(x) __builtin_expect (!!(x), 1)
-#else
-#define UTIL_LIKELY(x) (x)
-#endif
-
-#define UTIL_THROW_IF_ARG(Condition, Exception, Arg, Modify) do { \
- if (UTIL_UNLIKELY(Condition)) { \
- UTIL_THROW_BACKEND(#Condition, Exception, Arg, Modify); \
- } \
-} while (0)
-
-#define UTIL_THROW_IF(Condition, Exception, Modify) \
- UTIL_THROW_IF_ARG(Condition, Exception, , Modify)
-
-#define UTIL_THROW_IF2(Condition, Modify) \
- UTIL_THROW_IF_ARG(Condition, util::Exception, , Modify)
-
-// Exception that records errno and adds it to the message.
-class ErrnoException : public Exception {
- public:
- ErrnoException() throw();
-
- virtual ~ErrnoException() throw();
-
- int Error() const throw() { return errno_; }
-
- private:
- int errno_;
-};
-
-// File wasn't there, or couldn't be opened for some reason.
-class FileOpenException : public Exception {
- public:
- FileOpenException() throw() {}
- ~FileOpenException() throw() {}
-};
-
-// Utilities for overflow checking.
-class OverflowException : public Exception {
- public:
- OverflowException() throw();
- ~OverflowException() throw();
-};
-
-template <unsigned len> inline std::size_t CheckOverflowInternal(uint64_t value) {
- UTIL_THROW_IF(value > static_cast<uint64_t>(std::numeric_limits<std::size_t>::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code.");
- return value;
-}
-
-template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) {
- return value;
-}
-
-inline std::size_t CheckOverflow(uint64_t value) {
- return CheckOverflowInternal<sizeof(std::size_t)>(value);
-}
-
-#if defined(_WIN32) || defined(_WIN64)
-/* Thrown for Windows specific operations. */
-class WindowsException : public Exception {
- public:
- WindowsException() throw();
- ~WindowsException() throw();
-};
-#endif
-
-} // namespace util
-
-#endif // UTIL_EXCEPTION_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/fake_ostream.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/fake_ostream.hh b/ext/kenlm/util/fake_ostream.hh
deleted file mode 100644
index 2f76053..0000000
--- a/ext/kenlm/util/fake_ostream.hh
+++ /dev/null
@@ -1,111 +0,0 @@
-#ifndef UTIL_FAKE_OSTREAM_H
-#define UTIL_FAKE_OSTREAM_H
-
-#include "util/float_to_string.hh"
-#include "util/integer_to_string.hh"
-#include "util/string_piece.hh"
-
-#include <cassert>
-#include <limits>
-
-#include <stdint.h>
-
-namespace util {
-
-/* Like std::ostream but without being incredibly slow.
- * Supports most of the built-in types except for long double.
- *
- * The FakeOStream class is intended to be inherited from. The inheriting class
- * should provide:
- * public:
- * Derived &flush();
- * Derived &write(const void *data, std::size_t length);
- *
- * private: or protected:
- * friend class FakeOStream;
- * char *Ensure(std::size_t amount);
- * void AdvanceTo(char *to);
- *
- * The Ensure function makes enough space for an in-place write and returns
- * where to write. The AdvanceTo function happens after the write, saying how
- * much was actually written.
- *
- * Precondition:
- * amount <= kToStringMaxBytes for in-place writes.
- */
-template <class Derived> class FakeOStream {
- public:
- FakeOStream() {}
-
- // This also covers std::string and char*
- Derived &operator<<(StringPiece str) {
- return C().write(str.data(), str.size());
- }
-
- // Handle integers by size and signedness.
- private:
- template <class Arg> struct EnableIfKludge {
- typedef Derived type;
- };
- template <class From, unsigned Length = sizeof(From), bool Signed = std::numeric_limits<From>::is_signed, bool IsInteger = std::numeric_limits<From>::is_integer> struct Coerce {};
-
- template <class From> struct Coerce<From, 2, false, true> { typedef uint16_t To; };
- template <class From> struct Coerce<From, 4, false, true> { typedef uint32_t To; };
- template <class From> struct Coerce<From, 8, false, true> { typedef uint64_t To; };
-
- template <class From> struct Coerce<From, 2, true, true> { typedef int16_t To; };
- template <class From> struct Coerce<From, 4, true, true> { typedef int32_t To; };
- template <class From> struct Coerce<From, 8, true, true> { typedef int64_t To; };
- public:
- template <class From> typename EnableIfKludge<typename Coerce<From>::To>::type &operator<<(const From value) {
- return CallToString(static_cast<typename Coerce<From>::To>(value));
- }
-
- // Character types that get copied as bytes instead of displayed as integers.
- Derived &operator<<(char val) { return put(val); }
- Derived &operator<<(signed char val) { return put(static_cast<char>(val)); }
- Derived &operator<<(unsigned char val) { return put(static_cast<char>(val)); }
-
- Derived &operator<<(bool val) { return put(val + '0'); }
- // enums will fall back to int but are not caught by the template.
- Derived &operator<<(int val) { return CallToString(static_cast<typename Coerce<int>::To>(val)); }
-
- Derived &operator<<(float val) { return CallToString(val); }
- Derived &operator<<(double val) { return CallToString(val); }
-
- // This is here to catch all the other pointer types.
- Derived &operator<<(const void *value) { return CallToString(value); }
- // This is here because the above line also catches const char*.
- Derived &operator<<(const char *value) { return *this << StringPiece(value); }
- Derived &operator<<(char *value) { return *this << StringPiece(value); }
-
- Derived &put(char val) {
- char *c = C().Ensure(1);
- *c = val;
- C().AdvanceTo(++c);
- return C();
- }
-
- char widen(char val) const { return val; }
-
- private:
- // References to derived class for convenience.
- Derived &C() {
- return *static_cast<Derived*>(this);
- }
-
- const Derived &C() const {
- return *static_cast<const Derived*>(this);
- }
-
- // This is separate to prevent an infinite loop if the compiler considers
- // types the same (i.e. gcc std::size_t and uint64_t or uint32_t).
- template <class T> Derived &CallToString(const T value) {
- C().AdvanceTo(ToString(value, C().Ensure(ToStringBuf<T>::kBytes)));
- return C();
- }
-};
-
-} // namespace
-
-#endif // UTIL_FAKE_OSTREAM_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/file.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/file.cc b/ext/kenlm/util/file.cc
deleted file mode 100644
index e8976bc..0000000
--- a/ext/kenlm/util/file.cc
+++ /dev/null
@@ -1,574 +0,0 @@
-#define _LARGEFILE64_SOURCE
-#define _FILE_OFFSET_BITS 64
-
-#include "util/file.hh"
-
-#include "util/exception.hh"
-
-#include <algorithm>
-#include <cstdlib>
-#include <cstdio>
-#include <iostream>
-#include <limits>
-#include <sstream>
-
-
-#include <cassert>
-#include <cerrno>
-#include <climits>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <stdint.h>
-
-#if defined(__MINGW32__)
-#include <windows.h>
-#include <unistd.h>
-#warning "The file functions on MinGW have not been tested for file sizes above 2^31 - 1. Please read https://stackoverflow.com/questions/12539488/determine-64-bit-file-size-in-c-on-mingw-32-bit and fix"
-#elif defined(_WIN32) || defined(_WIN64)
-#include <windows.h>
-#include <io.h>
-#else
-#include <unistd.h>
-#endif
-
-namespace util {
-
-scoped_fd::~scoped_fd() {
- if (fd_ != -1 && close(fd_)) {
- std::cerr << "Could not close file " << fd_ << std::endl;
- std::abort();
- }
-}
-
-void scoped_FILE_closer::Close(std::FILE *file) {
- if (file && std::fclose(file)) {
- std::cerr << "Could not close file " << file << std::endl;
- std::abort();
- }
-}
-
-// Note that ErrnoException records errno before NameFromFD is called.
-FDException::FDException(int fd) throw() : fd_(fd), name_guess_(NameFromFD(fd)) {
- *this << "in " << name_guess_ << ' ';
-}
-
-FDException::~FDException() throw() {}
-
-EndOfFileException::EndOfFileException() throw() {
- *this << "End of file";
-}
-EndOfFileException::~EndOfFileException() throw() {}
-
-bool InputFileIsStdin(StringPiece path) {
- return path == "-" || path == "/dev/stdin";
-}
-
-bool OutputFileIsStdout(StringPiece path) {
- return path == "-" || path == "/dev/stdout";
-}
-
-int OpenReadOrThrow(const char *name) {
- int ret;
-#if defined(_WIN32) || defined(_WIN64)
- UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name);
-#else
- UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name);
-#endif
- return ret;
-}
-
-int CreateOrThrow(const char *name) {
- int ret;
-#if defined(_WIN32) || defined(_WIN64)
- UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR | _O_BINARY, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name);
-#else
- UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name);
-#endif
- return ret;
-}
-
-uint64_t SizeFile(int fd) {
-#if defined __MINGW32__
- struct stat sb;
- // Does this handle 64-bit?
- int ret = fstat(fd, &sb);
- if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
- return sb.st_size;
-#elif defined(_WIN32) || defined(_WIN64)
- __int64 ret = _filelengthi64(fd);
- return (ret == -1) ? kBadSize : ret;
-#else // Not windows.
-
-#ifdef OS_ANDROID
- struct stat64 sb;
- int ret = fstat64(fd, &sb);
-#else
- struct stat sb;
- int ret = fstat(fd, &sb);
-#endif
- if (ret == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize;
- return sb.st_size;
-#endif
-}
-
-uint64_t SizeOrThrow(int fd) {
- uint64_t ret = SizeFile(fd);
- UTIL_THROW_IF_ARG(ret == kBadSize, FDException, (fd), "Failed to size");
- return ret;
-}
-
-void ResizeOrThrow(int fd, uint64_t to) {
-#if defined __MINGW32__
- // Does this handle 64-bit?
- int ret = ftruncate
-#elif defined(_WIN32) || defined(_WIN64)
- errno_t ret = _chsize_s
-#elif defined(OS_ANDROID)
- int ret = ftruncate64
-#else
- int ret = ftruncate
-#endif
- (fd, to);
- UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes");
-}
-
-namespace {
-std::size_t GuardLarge(std::size_t size) {
- // The following operating systems have broken read/write/pread/pwrite that
- // only support up to 2^31.
- // OS X man pages claim to support 64-bit, but Kareem M. Darwish had problems
- // building with larger files, so APPLE is also here.
-#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID) || defined(__MINGW32__)
- return size < INT_MAX ? size : INT_MAX;
-#else
- return size;
-#endif
-}
-}
-
-#if defined(_WIN32) || defined(_WIN64)
-namespace {
-const std::size_t kMaxDWORD = static_cast<std::size_t>(4294967295UL);
-} // namespace
-#endif
-
-std::size_t PartialRead(int fd, void *to, std::size_t amount) {
-#if defined(_WIN32) || defined(_WIN64)
- DWORD ret;
- HANDLE file_handle = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
- DWORD larger_size = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, amount));
- DWORD smaller_size = 28672; // Received reports that 31346 worked but higher values did not. This rounds down to the nearest multiple of 4096, the page size.
- if (!ReadFile(file_handle, to, larger_size, &ret, NULL))
- {
- DWORD last_error = GetLastError();
- if (last_error != ERROR_NOT_ENOUGH_MEMORY || !ReadFile(file_handle, to, smaller_size, &ret, NULL)) {
- UTIL_THROW(WindowsException, "Windows error in ReadFile.");
- }
- }
-#else
- errno = 0;
- ssize_t ret;
- do {
- ret = read(fd, to, GuardLarge(amount));
- } while (ret == -1 && errno == EINTR);
- UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
-#endif
- return static_cast<std::size_t>(ret);
-}
-
-void ReadOrThrow(int fd, void *to_void, std::size_t amount) {
- uint8_t *to = static_cast<uint8_t*>(to_void);
- while (amount) {
- std::size_t ret = PartialRead(fd, to, amount);
- UTIL_THROW_IF(ret == 0, EndOfFileException, " in " << NameFromFD(fd) << " but there should be " << amount << " more bytes to read.");
- amount -= ret;
- to += ret;
- }
-}
-
-std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) {
- uint8_t *to = static_cast<uint8_t*>(to_void);
- std::size_t remaining = amount;
- while (remaining) {
- std::size_t ret = PartialRead(fd, to, remaining);
- if (!ret) return amount - remaining;
- remaining -= ret;
- to += ret;
- }
- return amount;
-}
-
-void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
- const uint8_t *data = static_cast<const uint8_t*>(data_void);
- while (size) {
-#if defined(_WIN32) || defined(_WIN64)
- int ret;
-#else
- ssize_t ret;
-#endif
- errno = 0;
- do {
- ret =
-#if defined(_WIN32) || defined(_WIN64)
- _write
-#else
- write
-#endif
- (fd, data, GuardLarge(size));
- } while (ret == -1 && errno == EINTR);
- UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes");
- data += ret;
- size -= ret;
- }
-}
-
-void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
- if (!size) return;
- UTIL_THROW_IF(1 != std::fwrite(data, size, 1, to), ErrnoException, "Short write; requested size " << size);
-}
-
-void ErsatzPRead(int fd, void *to_void, std::size_t size, uint64_t off) {
- uint8_t *to = static_cast<uint8_t*>(to_void);
- while (size) {
-#if defined(_WIN32) || defined(_WIN64)
- /* BROKEN: changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() which lmplz does. */
- // size_t might be 64-bit. DWORD is always 32.
- DWORD reading = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, size));
- DWORD ret;
- OVERLAPPED overlapped;
- memset(&overlapped, 0, sizeof(OVERLAPPED));
- overlapped.Offset = static_cast<DWORD>(off);
- overlapped.OffsetHigh = static_cast<DWORD>(off >> 32);
- UTIL_THROW_IF(!ReadFile((HANDLE)_get_osfhandle(fd), to, reading, &ret, &overlapped), WindowsException, "ReadFile failed for offset " << off);
-#else
- ssize_t ret;
- errno = 0;
- ret =
-#ifdef OS_ANDROID
- pread64
-#else
- pread
-#endif
- (fd, to, GuardLarge(size), off);
- if (ret <= 0) {
- if (ret == -1 && errno == EINTR) continue;
- UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
- UTIL_THROW_ARG(FDException, (fd), "while reading " << size << " bytes at offset " << off);
- }
-#endif
- size -= ret;
- off += ret;
- to += ret;
- }
-}
-
-void ErsatzPWrite(int fd, const void *from_void, std::size_t size, uint64_t off) {
- const uint8_t *from = static_cast<const uint8_t*>(from_void);
- while(size) {
-#if defined(_WIN32) || defined(_WIN64)
- /* Changes file pointer. Even if you save it and change it back, it won't be safe to use concurrently with write() or read() */
- // size_t might be 64-bit. DWORD is always 32.
- DWORD writing = static_cast<DWORD>(std::min<std::size_t>(kMaxDWORD, size));
- DWORD ret;
- OVERLAPPED overlapped;
- memset(&overlapped, 0, sizeof(OVERLAPPED));
- overlapped.Offset = static_cast<DWORD>(off);
- overlapped.OffsetHigh = static_cast<DWORD>(off >> 32);
- UTIL_THROW_IF(!WriteFile((HANDLE)_get_osfhandle(fd), from, writing, &ret, &overlapped), Exception, "WriteFile failed for offset " << off);
-#else
- ssize_t ret;
- errno = 0;
- ret =
-#ifdef OS_ANDROID
- pwrite64
-#else
- pwrite
-#endif
- (fd, from, GuardLarge(size), off);
- if (ret <= 0) {
- if (ret == -1 && errno == EINTR) continue;
- UTIL_THROW_IF(ret == 0, EndOfFileException, " for writing " << size << " bytes at " << off << " from " << NameFromFD(fd));
- UTIL_THROW_ARG(FDException, (fd), "while writing " << size << " bytes at offset " << off);
- }
-#endif
- size -= ret;
- off += ret;
- from += ret;
- }
-}
-
-
-void FSyncOrThrow(int fd) {
-// Apparently windows doesn't have fsync?
-#if !defined(_WIN32) && !defined(_WIN64)
- UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "while syncing");
-#endif
-}
-
-namespace {
-
-// Static assert for 64-bit off_t size.
-#if !defined(_WIN32) && !defined(_WIN64) && !defined(OS_ANDROID)
-template <unsigned> struct CheckOffT;
-template <> struct CheckOffT<8> {
- struct True {};
-};
-// If there's a compiler error on the next line, then off_t isn't 64 bit. And
-// that makes me a sad panda.
-typedef CheckOffT<sizeof(off_t)>::True IgnoredType;
-#endif
-
-// Can't we all just get along?
-void InternalSeek(int fd, int64_t off, int whence) {
- if (
-#if defined __MINGW32__
- // Does this handle 64-bit?
- (off_t)-1 == lseek(fd, off, whence)
-#elif defined(_WIN32) || defined(_WIN64)
- (__int64)-1 == _lseeki64(fd, off, whence)
-#elif defined(OS_ANDROID)
- (off64_t)-1 == lseek64(fd, off, whence)
-#else
- (off_t)-1 == lseek(fd, off, whence)
-#endif
- ) UTIL_THROW_ARG(FDException, (fd), "while seeking to " << off << " whence " << whence);
-}
-} // namespace
-
-void SeekOrThrow(int fd, uint64_t off) {
- InternalSeek(fd, off, SEEK_SET);
-}
-
-void AdvanceOrThrow(int fd, int64_t off) {
- InternalSeek(fd, off, SEEK_CUR);
-}
-
-void SeekEnd(int fd) {
- InternalSeek(fd, 0, SEEK_END);
-}
-
-std::FILE *FDOpenOrThrow(scoped_fd &file) {
- std::FILE *ret = fdopen(file.get(), "r+b");
- UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for write");
- file.release();
- return ret;
-}
-
-std::FILE *FDOpenReadOrThrow(scoped_fd &file) {
- std::FILE *ret = fdopen(file.get(), "rb");
- UTIL_THROW_IF_ARG(!ret, FDException, (file.get()), "Could not fdopen for read");
- file.release();
- return ret;
-}
-
-// Sigh. Windows temporary file creation is full of race conditions.
-#if defined(_WIN32) || defined(_WIN64)
-/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright
- (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version. */
-
-/* This has been modified from the original version to rename the function and
- * set the Windows temporary flag. */
-
-static const char letters[] =
-"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
-
-/* Generate a temporary file name based on TMPL. TMPL must match the
- rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed
- does not exist at the time of the call to mkstemp. TMPL is
- overwritten with the result. */
-int
-mkstemp_and_unlink(char *tmpl)
-{
- int len;
- char *XXXXXX;
- static unsigned long long value;
- unsigned long long random_time_bits;
- unsigned int count;
- int fd = -1;
- int save_errno = errno;
-
- /* A lower bound on the number of temporary files to attempt to
- generate. The maximum total number of temporary file names that
- can exist for a given template is 62**6. It should never be
- necessary to try all these combinations. Instead if a reasonable
- number of names is tried (we define reasonable as 62**3) fail to
- give the system administrator the chance to remove the problems. */
-#define ATTEMPTS_MIN (62 * 62 * 62)
-
- /* The number of times to attempt to generate a temporary file. To
- conform to POSIX, this must be no smaller than TMP_MAX. */
-#if ATTEMPTS_MIN < TMP_MAX
- unsigned int attempts = TMP_MAX;
-#else
- unsigned int attempts = ATTEMPTS_MIN;
-#endif
-
- len = strlen (tmpl);
- if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX"))
- {
- errno = EINVAL;
- return -1;
- }
-
-/* This is where the Xs start. */
- XXXXXX = &tmpl[len - 6];
-
- /* Get some more or less random data. */
- {
- SYSTEMTIME stNow;
- FILETIME ftNow;
-
- // get system time
- GetSystemTime(&stNow);
- stNow.wMilliseconds = 500;
- if (!SystemTimeToFileTime(&stNow, &ftNow))
- {
- errno = -1;
- return -1;
- }
-
- random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32)
- | (unsigned long long)ftNow.dwLowDateTime);
- }
- value += random_time_bits ^ (unsigned long long)GetCurrentThreadId ();
-
- for (count = 0; count < attempts; value += 7777, ++count)
- {
- unsigned long long v = value;
-
- /* Fill in the random bits. */
- XXXXXX[0] = letters[v % 62];
- v /= 62;
- XXXXXX[1] = letters[v % 62];
- v /= 62;
- XXXXXX[2] = letters[v % 62];
- v /= 62;
- XXXXXX[3] = letters[v % 62];
- v /= 62;
- XXXXXX[4] = letters[v % 62];
- v /= 62;
- XXXXXX[5] = letters[v % 62];
-
- /* Modified for windows and to unlink */
- // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE);
- int flags = _O_RDWR | _O_CREAT | _O_EXCL | _O_BINARY;
- flags |= _O_TEMPORARY;
- fd = _open (tmpl, flags, _S_IREAD | _S_IWRITE);
- if (fd >= 0)
- {
- errno = save_errno;
- return fd;
- }
- else if (errno != EEXIST)
- return -1;
- }
-
- /* We got out of the loop because we ran out of combinations to try. */
- errno = EEXIST;
- return -1;
-}
-#else
-int
-mkstemp_and_unlink(char *tmpl) {
- int ret = mkstemp(tmpl);
- if (ret != -1) {
- UTIL_THROW_IF(unlink(tmpl), ErrnoException, "while deleting delete " << tmpl);
- }
- return ret;
-}
-#endif
-
-// If it's a directory, add a /. This lets users say -T /tmp without creating
-// /tmpAAAAAA
-void NormalizeTempPrefix(std::string &base) {
- if (base.empty()) return;
- if (base[base.size() - 1] == '/') return;
- struct stat sb;
- // It's fine for it to not exist.
- if (-1 == stat(base.c_str(), &sb)) return;
- if (
-#if defined(_WIN32) || defined(_WIN64)
- sb.st_mode & _S_IFDIR
-#else
- S_ISDIR(sb.st_mode)
-#endif
- ) base += '/';
-}
-
-int MakeTemp(const StringPiece &base) {
- std::string name(base.data(), base.size());
- name += "XXXXXX";
- name.push_back(0);
- int ret;
- UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(&name[0])), ErrnoException, "while making a temporary based on " << base);
- return ret;
-}
-
-std::FILE *FMakeTemp(const StringPiece &base) {
- util::scoped_fd file(MakeTemp(base));
- return FDOpenOrThrow(file);
-}
-
-int DupOrThrow(int fd) {
- int ret = dup(fd);
- UTIL_THROW_IF_ARG(ret == -1, FDException, (fd), "in duplicating the file descriptor");
- return ret;
-}
-
-namespace {
-// Try to name things but be willing to fail too.
-bool TryName(int fd, std::string &out) {
-#if defined(_WIN32) || defined(_WIN64)
- return false;
-#else
- std::string name("/proc/self/fd/");
- std::ostringstream convert;
- convert << fd;
- name += convert.str();
-
- struct stat sb;
- if (-1 == lstat(name.c_str(), &sb))
- return false;
- out.resize(sb.st_size + 1);
- // lstat gave us a size, but I've seen it grow, possibly due to symlinks on top of symlinks.
- while (true) {
- ssize_t ret = readlink(name.c_str(), &out[0], out.size());
- if (-1 == ret)
- return false;
- if ((size_t)ret < out.size()) {
- out.resize(ret);
- break;
- }
- // Exponential growth.
- out.resize(out.size() * 2);
- }
- // Don't use the non-file names.
- if (!out.empty() && out[0] != '/')
- return false;
- return true;
-#endif
-}
-} // namespace
-
-std::string NameFromFD(int fd) {
- std::string ret;
- if (TryName(fd, ret)) return ret;
- switch (fd) {
- case 0: return "stdin";
- case 1: return "stdout";
- case 2: return "stderr";
- }
- ret = "fd ";
- std::ostringstream convert;
- convert << fd;
- ret += convert.str();
- return ret;
-}
-
-} // namespace util
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/file.hh
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/file.hh b/ext/kenlm/util/file.hh
deleted file mode 100644
index f7cb4d6..0000000
--- a/ext/kenlm/util/file.hh
+++ /dev/null
@@ -1,154 +0,0 @@
-#ifndef UTIL_FILE_H
-#define UTIL_FILE_H
-
-#include "util/exception.hh"
-#include "util/scoped.hh"
-#include "util/string_piece.hh"
-
-#include <cstddef>
-#include <cstdio>
-#include <string>
-#include <stdint.h>
-
-namespace util {
-
-class scoped_fd {
- public:
- scoped_fd() : fd_(-1) {}
-
- explicit scoped_fd(int fd) : fd_(fd) {}
-
- ~scoped_fd();
-
- void reset(int to = -1) {
- scoped_fd other(fd_);
- fd_ = to;
- }
-
- int get() const { return fd_; }
-
- int operator*() const { return fd_; }
-
- int release() {
- int ret = fd_;
- fd_ = -1;
- return ret;
- }
-
- private:
- int fd_;
-
- scoped_fd(const scoped_fd &);
- scoped_fd &operator=(const scoped_fd &);
-};
-
-struct scoped_FILE_closer {
- static void Close(std::FILE *file);
-};
-typedef scoped<std::FILE, scoped_FILE_closer> scoped_FILE;
-
-/* Thrown for any operation where the fd is known. */
-class FDException : public ErrnoException {
- public:
- explicit FDException(int fd) throw();
-
- virtual ~FDException() throw();
-
- // This may no longer be valid if the exception was thrown past open.
- int FD() const { return fd_; }
-
- // Guess from NameFromFD.
- const std::string &NameGuess() const { return name_guess_; }
-
- private:
- int fd_;
-
- std::string name_guess_;
-};
-
-// End of file reached.
-class EndOfFileException : public Exception {
- public:
- EndOfFileException() throw();
- ~EndOfFileException() throw();
-};
-
-// Open for read only.
-int OpenReadOrThrow(const char *name);
-// Create file if it doesn't exist, truncate if it does. Opened for write.
-int CreateOrThrow(const char *name);
-
-/** Does the given input file path denote standard input?
- *
- * Returns true if, and only if, path is either "-" or "/dev/stdin".
- *
- * Opening standard input as a file may need some special treatment for
- * portability. There's a convention that a dash ("-") in place of an input
- * file path denotes standard input, but opening "/dev/stdin" may need to be
- * special as well.
- */
-bool InputPathIsStdin(StringPiece path);
-
-/** Does the given output file path denote standard output?
- *
- * Returns true if, and only if, path is either "-" or "/dev/stdout".
- *
- * Opening standard output as a file may need some special treatment for
- * portability. There's a convention that a dash ("-") in place of an output
- * file path denotes standard output, but opening "/dev/stdout" may need to be
- * special as well.
- */
-bool OutputPathIsStdout(StringPiece path);
-
-// Return value for SizeFile when it can't size properly.
-const uint64_t kBadSize = (uint64_t)-1;
-uint64_t SizeFile(int fd);
-uint64_t SizeOrThrow(int fd);
-
-void ResizeOrThrow(int fd, uint64_t to);
-
-std::size_t PartialRead(int fd, void *to, std::size_t size);
-void ReadOrThrow(int fd, void *to, std::size_t size);
-std::size_t ReadOrEOF(int fd, void *to_void, std::size_t size);
-
-void WriteOrThrow(int fd, const void *data_void, std::size_t size);
-void WriteOrThrow(FILE *to, const void *data, std::size_t size);
-
-/* These call pread/pwrite in a loop. However, on Windows they call ReadFile/
- * WriteFile which changes the file pointer. So it's safe to call ErsatzPRead
- * and ErsatzPWrite concurrently (or any combination thereof). But it changes
- * the file pointer on windows, so it's not safe to call concurrently with
- * anything that uses the implicit file pointer e.g. the Read/Write functions
- * above.
- */
-void ErsatzPRead(int fd, void *to, std::size_t size, uint64_t off);
-void ErsatzPWrite(int fd, const void *data_void, std::size_t size, uint64_t off);
-
-void FSyncOrThrow(int fd);
-
-// Seeking
-void SeekOrThrow(int fd, uint64_t off);
-void AdvanceOrThrow(int fd, int64_t off);
-void SeekEnd(int fd);
-
-std::FILE *FDOpenOrThrow(scoped_fd &file);
-std::FILE *FDOpenReadOrThrow(scoped_fd &file);
-
-// Temporary files
-// Append a / if base is a directory.
-void NormalizeTempPrefix(std::string &base);
-int MakeTemp(const StringPiece &prefix);
-std::FILE *FMakeTemp(const StringPiece &prefix);
-
-// dup an fd.
-int DupOrThrow(int fd);
-
-/* Attempt get file name from fd. This won't always work (i.e. on Windows or
- * a pipe). The file might have been renamed. It's intended for diagnostics
- * and logging only.
- */
-std::string NameFromFD(int fd);
-
-} // namespace util
-
-#endif // UTIL_FILE_H
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6da3961b/ext/kenlm/util/file_piece.cc
----------------------------------------------------------------------
diff --git a/ext/kenlm b/ext/kenlm
new file mode 160000
index 0000000..56fdb5c
--- /dev/null
+++ b/ext/kenlm
@@ -0,0 +1 @@
+Subproject commit 56fdb5c44fca34d5a2e07d96139c28fb163983c5
diff --git a/ext/kenlm/util/file_piece.cc b/ext/kenlm/util/file_piece.cc
deleted file mode 100644
index 0a4d3a9..0000000
--- a/ext/kenlm/util/file_piece.cc
+++ /dev/null
@@ -1,337 +0,0 @@
-#include "util/file_piece.hh"
-
-#include "util/double-conversion/double-conversion.h"
-#include "util/exception.hh"
-#include "util/file.hh"
-#include "util/mmap.hh"
-
-#if defined(_WIN32) || defined(_WIN64)
-#include <io.h>
-#else
-#include <unistd.h>
-#endif
-
-#include <cassert>
-#include <cerrno>
-#include <cmath>
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <string>
-
-#include <fcntl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-namespace util {
-
-ParseNumberException::ParseNumberException(StringPiece value) throw() {
- *this << "Could not parse \"" << value << "\" into a ";
-}
-
-// Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale).
-const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
-
-FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) :
- file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()),
- progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name) {
- Initialize(name, show_progress, min_buffer);
-}
-
-namespace {
-std::string NamePossiblyFind(int fd, const char *name) {
- if (name) return name;
- return NameFromFD(fd);
-}
-} // namespace
-
-FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) :
- file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()),
- progress_(total_size_, total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + NamePossiblyFind(fd, name)) {
- Initialize(NamePossiblyFind(fd, name).c_str(), show_progress, min_buffer);
-}
-
-FilePiece::FilePiece(std::istream &stream, const char *name, std::size_t min_buffer) :
- total_size_(kBadSize), page_(SizePage()) {
- InitializeNoRead("istream", min_buffer);
-
- fallback_to_read_ = true;
- HugeMalloc(default_map_size_, false, data_);
- position_ = data_.begin();
- position_end_ = position_;
-
- fell_back_.Reset(stream);
-}
-
-FilePiece::~FilePiece() {}
-
-StringPiece FilePiece::ReadLine(char delim, bool strip_cr) {
- std::size_t skip = 0;
- while (true) {
- for (const char *i = position_ + skip; i < position_end_; ++i) {
- if (*i == delim) {
- // End of line.
- // Take 1 byte off the end if it's an unwanted carriage return.
- const std::size_t subtract_cr = (
- (strip_cr && i > position_ && *(i - 1) == '\r') ?
- 1 : 0);
- StringPiece ret(position_, i - position_ - subtract_cr);
- position_ = i + 1;
- return ret;
- }
- }
- if (at_end_) {
- if (position_ == position_end_) {
- Shift();
- }
- return Consume(position_end_);
- }
- skip = position_end_ - position_;
- Shift();
- }
-}
-
-bool FilePiece::ReadLineOrEOF(StringPiece &to, char delim, bool strip_cr) {
- try {
- to = ReadLine(delim, strip_cr);
- } catch (const util::EndOfFileException &e) { return false; }
- return true;
-}
-
-float FilePiece::ReadFloat() {
- return ReadNumber<float>();
-}
-double FilePiece::ReadDouble() {
- return ReadNumber<double>();
-}
-long int FilePiece::ReadLong() {
- return ReadNumber<long int>();
-}
-unsigned long int FilePiece::ReadULong() {
- return ReadNumber<unsigned long int>();
-}
-
-// Factored out so that istream can call this.
-void FilePiece::InitializeNoRead(const char *name, std::size_t min_buffer) {
- file_name_ = name;
-
- default_map_size_ = page_ * std::max<std::size_t>((min_buffer / page_ + 1), 2);
- position_ = NULL;
- position_end_ = NULL;
- mapped_offset_ = 0;
- at_end_ = false;
-}
-
-void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) {
- InitializeNoRead(name, min_buffer);
-
- if (total_size_ == kBadSize) {
- // So the assertion passes.
- fallback_to_read_ = false;
- if (show_progress)
- *show_progress << "File " << name << " isn't normal. Using slower read() instead of mmap(). No progress bar." << std::endl;
- TransitionToRead();
- } else {
- fallback_to_read_ = false;
- }
- Shift();
- // gzip detect.
- if ((position_end_ >= position_ + ReadCompressed::kMagicSize) && ReadCompressed::DetectCompressedMagic(position_)) {
- if (!fallback_to_read_) {
- at_end_ = false;
- TransitionToRead();
- }
- }
-}
-
-namespace {
-
-static const double_conversion::StringToDoubleConverter kConverter(
- double_conversion::StringToDoubleConverter::ALLOW_TRAILING_JUNK | double_conversion::StringToDoubleConverter::ALLOW_LEADING_SPACES,
- std::numeric_limits<double>::quiet_NaN(),
- std::numeric_limits<double>::quiet_NaN(),
- "inf",
- "NaN");
-
-StringPiece FirstToken(StringPiece str) {
- const char *i;
- for (i = str.data(); i != str.data() + str.size(); ++i) {
- if (kSpaces[(unsigned char)*i]) break;
- }
- return StringPiece(str.data(), i - str.data());
-}
-
-const char *ParseNumber(StringPiece str, float &out) {
- int count;
- out = kConverter.StringToFloat(str.data(), str.size(), &count);
- UTIL_THROW_IF_ARG(std::isnan(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "float");
- return str.data() + count;
-}
-const char *ParseNumber(StringPiece str, double &out) {
- int count;
- out = kConverter.StringToDouble(str.data(), str.size(), &count);
- UTIL_THROW_IF_ARG(std::isnan(out) && str != "NaN" && str != "nan", ParseNumberException, (FirstToken(str)), "double");
- return str.data() + count;
-}
-const char *ParseNumber(StringPiece str, long int &out) {
- char *end;
- errno = 0;
- out = strtol(str.data(), &end, 10);
- UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "long int");
- return end;
-}
-const char *ParseNumber(StringPiece str, unsigned long int &out) {
- char *end;
- errno = 0;
- out = strtoul(str.data(), &end, 10);
- UTIL_THROW_IF_ARG(errno || (end == str.data()), ParseNumberException, (FirstToken(str)), "unsigned long int");
- return end;
-}
-} // namespace
-
-template <class T> T FilePiece::ReadNumber() {
- SkipSpaces();
- while (last_space_ < position_) {
- if (UTIL_UNLIKELY(at_end_)) {
- // Hallucinate a null off the end of the file.
- std::string buffer(position_, position_end_);
- T ret;
- // Has to be null-terminated.
- const char *begin = buffer.c_str();
- const char *end = ParseNumber(StringPiece(begin, buffer.size()), ret);
- position_ += end - begin;
- return ret;
- }
- Shift();
- }
- T ret;
- position_ = ParseNumber(StringPiece(position_, last_space_ - position_), ret);
- return ret;
-}
-
-const char *FilePiece::FindDelimiterOrEOF(const bool *delim) {
- std::size_t skip = 0;
- while (true) {
- for (const char *i = position_ + skip; i < position_end_; ++i) {
- if (delim[static_cast<unsigned char>(*i)]) return i;
- }
- if (at_end_) {
- if (position_ == position_end_) Shift();
- return position_end_;
- }
- skip = position_end_ - position_;
- Shift();
- }
-}
-
-void FilePiece::Shift() {
- if (at_end_) {
- progress_.Finished();
- throw EndOfFileException();
- }
- uint64_t desired_begin = position_ - data_.begin() + mapped_offset_;
-
- if (!fallback_to_read_) MMapShift(desired_begin);
- // Notice an mmap failure might set the fallback.
- if (fallback_to_read_) ReadShift();
-
- for (last_space_ = position_end_ - 1; last_space_ >= position_; --last_space_) {
- if (kSpaces[static_cast<unsigned char>(*last_space_)]) break;
- }
-}
-
-void FilePiece::MMapShift(uint64_t desired_begin) {
- // Use mmap.
- uint64_t ignore = desired_begin % page_;
- // Duplicate request for Shift means give more data.
- if (position_ == data_.begin() + ignore && position_) {
- default_map_size_ *= 2;
- }
- // Local version so that in case of failure it doesn't overwrite the class variable.
- uint64_t mapped_offset = desired_begin - ignore;
-
- uint64_t mapped_size;
- if (default_map_size_ >= static_cast<std::size_t>(total_size_ - mapped_offset)) {
- at_end_ = true;
- mapped_size = total_size_ - mapped_offset;
- } else {
- mapped_size = default_map_size_;
- }
-
- // Forcibly clear the existing mmap first.
- data_.reset();
- try {
- MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_);
- } catch (const util::ErrnoException &e) {
- if (desired_begin) {
- SeekOrThrow(*file_, desired_begin);
- }
- // The mmap was scheduled to end the file, but now we're going to read it.
- at_end_ = false;
- TransitionToRead();
- return;
- }
- mapped_offset_ = mapped_offset;
- position_ = data_.begin() + ignore;
- position_end_ = data_.begin() + mapped_size;
-
- progress_.Set(desired_begin);
-}
-
-void FilePiece::TransitionToRead() {
- assert(!fallback_to_read_);
- fallback_to_read_ = true;
- data_.reset();
- HugeMalloc(default_map_size_, false, data_);
- position_ = data_.begin();
- position_end_ = position_;
-
- try {
- fell_back_.Reset(file_.release());
- } catch (util::Exception &e) {
- e << " in file " << file_name_;
- throw;
- }
-}
-
-void FilePiece::ReadShift() {
- assert(fallback_to_read_);
- // Bytes [data_.begin(), position_) have been consumed.
- // Bytes [position_, position_end_) have been read into the buffer.
-
- // Start at the beginning of the buffer if there's nothing useful in it.
- if (position_ == position_end_) {
- mapped_offset_ += (position_end_ - data_.begin());
- position_ = data_.begin();
- position_end_ = position_;
- }
-
- std::size_t already_read = position_end_ - data_.begin();
-
- if (already_read == default_map_size_) {
- if (position_ == data_.begin()) {
- // Buffer too small.
- std::size_t valid_length = position_end_ - position_;
- default_map_size_ *= 2;
- HugeRealloc(default_map_size_, false, data_);
- position_ = data_.begin();
- position_end_ = position_ + valid_length;
- } else {
- std::size_t moving = position_end_ - position_;
- memmove(data_.get(), position_, moving);
- position_ = data_.begin();
- position_end_ = position_ + moving;
- already_read = moving;
- }
- }
-
- std::size_t read_return = fell_back_.Read(static_cast<uint8_t*>(data_.get()) + already_read, default_map_size_ - already_read);
- progress_.Set(fell_back_.RawAmount());
-
- if (read_return == 0) {
- at_end_ = true;
- }
- position_end_ += read_return;
-}
-
-} // namespace util