You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2022/07/06 16:02:20 UTC
[arrow] branch master updated: PARQUET-2163: Handle decimal schemas with large fixed_len_byte_arrays
This is an automated email from the ASF dual-hosted git repository.
emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 423ca163a2 PARQUET-2163: Handle decimal schemas with large fixed_len_byte_arrays
423ca163a2 is described below
commit 423ca163a26781ef6a8229af22b7e6e2d7423a54
Author: William Butler <wa...@google.com>
AuthorDate: Wed Jul 6 09:02:00 2022 -0700
PARQUET-2163: Handle decimal schemas with large fixed_len_byte_arrays
The precision calculation had been overflowing to infinity when the
length of the fixed_len_byte_array > 128, triggering an error when then
trying to convert infinity to an int32. We can actually simplify the
logic by noting that log_b(a^(x)) = log_b(a)*x. This avoids the
intermediate infinity. We also added a check for extremely large value
sizes implying a max precision that cannot fit in int32. Even 129 byte
decimal seems extreme.
The formula Parquet C++ was using is technically incorrect vs the
Parquet specification. The specification says that the max precision is
floor(log_10(2^(B*8 -1) - 1)), where the C++ implementation was omitting the
outer -1. However, this is okay as it is easy to prove that these values
will always be the same (ignoring the realities of FP arithmetic) & in
practice all three formulas agree through 128 when using FP.
Bug found through fuzzing.
Closes #13456 from tachyonwill/float_overflow
Authored-by: William Butler <wa...@google.com>
Signed-off-by: Micah Kornfield <em...@gmail.com>
---
cpp/src/parquet/schema_test.cc | 28 ++++++++++++++++++++++++++++
cpp/src/parquet/types.cc | 8 +++++++-
2 files changed, 35 insertions(+), 1 deletion(-)
diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc
index 703bac8108..603d9ed8e2 100644
--- a/cpp/src/parquet/schema_test.cc
+++ b/cpp/src/parquet/schema_test.cc
@@ -1688,9 +1688,26 @@ TEST(TestSchemaNodeCreation, FactoryExceptions) {
ASSERT_ANY_THROW(PrimitiveNode::Make("interval", Repetition::REQUIRED,
IntervalLogicalType::Make(),
Type::FIXED_LEN_BYTE_ARRAY, 11));
+ // Scale is greater than precision.
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(10, 11), Type::INT64));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(17, 18), Type::INT64));
// Primitive too small for given precision ...
ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
DecimalLogicalType::Make(16, 6), Type::INT32));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(10, 9), Type::INT32));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(19, 17), Type::INT64));
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(308, 6),
+ Type::FIXED_LEN_BYTE_ARRAY, 128));
+ // Length is too long
+ ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+ DecimalLogicalType::Make(10, 6),
+ Type::FIXED_LEN_BYTE_ARRAY, 891723283));
+
// Incompatible primitive length ...
ASSERT_ANY_THROW(PrimitiveNode::Make("uuid", Repetition::REQUIRED,
UUIDLogicalType::Make(),
@@ -1942,6 +1959,17 @@ TEST_F(TestDecimalSchemaElementConstruction, DecimalCases) {
true, check_DECIMAL},
{"decimal", LogicalType::Decimal(11, 11), Type::INT64, -1, true,
ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(9, 9), Type::INT32, -1, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(18, 18), Type::INT64, -1, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(307, 7), Type::FIXED_LEN_BYTE_ARRAY, 128, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(310, 32), Type::FIXED_LEN_BYTE_ARRAY, 129, true,
+ ConvertedType::DECIMAL, true, check_DECIMAL},
+ {"decimal", LogicalType::Decimal(2147483645, 2147483645),
+ Type::FIXED_LEN_BYTE_ARRAY, 891723282, true, ConvertedType::DECIMAL, true,
+ check_DECIMAL},
};
for (const SchemaElementConstructionArguments& c : cases) {
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index ef23c40662..349fc682aa 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -909,8 +909,14 @@ bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_typ
}
} break;
case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+ // If the primitive length is larger than this we will overflow int32 when
+ // calculating precision.
+ if (primitive_length <= 0 || primitive_length > 891723282) {
+ ok = false;
+ break;
+ }
ok = precision_ <= static_cast<int32_t>(std::floor(
- std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
+ std::log10(2) * ((8.0 * primitive_length) - 1.0)));
} break;
case parquet::Type::BYTE_ARRAY: {
ok = true;