You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2022/07/06 16:02:20 UTC

[arrow] branch master updated: PARQUET-2163: Handle decimal schemas with large fixed_len_byte_arrays

This is an automated email from the ASF dual-hosted git repository.

emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 423ca163a2 PARQUET-2163:  Handle decimal schemas with large fixed_len_byte_arrays
423ca163a2 is described below

commit 423ca163a26781ef6a8229af22b7e6e2d7423a54
Author: William Butler <wa...@google.com>
AuthorDate: Wed Jul 6 09:02:00 2022 -0700

    PARQUET-2163:  Handle decimal schemas with large fixed_len_byte_arrays
    
    The precision calculation had been overflowing to infinity when the
    length of the fixed_len_byte_array > 128, triggering an error when then
    trying to convert infinity to an int32. We can actually simplify the
    logic by noting that log_b(a^(x)) = log_b(a)*x. This avoids the
    intermediate infinity. We also added a check for extremely large value
    sizes implying a max precision that cannot fit in int32. Even 129 byte
    decimal seems extreme.
    
    The formula Parquet C++ was using is technically incorrect vs the
    Parquet specification. The specification says that the max precision is
    floor(log_10(2^(B*8 -1) - 1)), where the C++ implementation was omitting the
    outer -1. However, this is okay as it is easy to prove that these values
    will always be the same (ignoring the realities of FP arithmetic) & in
    practice all three formulas agree through 128 when using FP.
    
    Bug found through fuzzing.
    
    Closes #13456 from tachyonwill/float_overflow
    
    Authored-by: William Butler <wa...@google.com>
    Signed-off-by: Micah Kornfield <em...@gmail.com>
---
 cpp/src/parquet/schema_test.cc | 28 ++++++++++++++++++++++++++++
 cpp/src/parquet/types.cc       |  8 +++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/schema_test.cc b/cpp/src/parquet/schema_test.cc
index 703bac8108..603d9ed8e2 100644
--- a/cpp/src/parquet/schema_test.cc
+++ b/cpp/src/parquet/schema_test.cc
@@ -1688,9 +1688,26 @@ TEST(TestSchemaNodeCreation, FactoryExceptions) {
   ASSERT_ANY_THROW(PrimitiveNode::Make("interval", Repetition::REQUIRED,
                                        IntervalLogicalType::Make(),
                                        Type::FIXED_LEN_BYTE_ARRAY, 11));
+  // Scale is greater than precision.
+  ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+                                       DecimalLogicalType::Make(10, 11), Type::INT64));
+  ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+                                       DecimalLogicalType::Make(17, 18), Type::INT64));
   // Primitive too small for given precision ...
   ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
                                        DecimalLogicalType::Make(16, 6), Type::INT32));
+  ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+                                       DecimalLogicalType::Make(10, 9), Type::INT32));
+  ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+                                       DecimalLogicalType::Make(19, 17), Type::INT64));
+  ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+                                       DecimalLogicalType::Make(308, 6),
+                                       Type::FIXED_LEN_BYTE_ARRAY, 128));
+  // Length is too long
+  ASSERT_ANY_THROW(PrimitiveNode::Make("decimal", Repetition::REQUIRED,
+                                       DecimalLogicalType::Make(10, 6),
+                                       Type::FIXED_LEN_BYTE_ARRAY, 891723283));
+
   // Incompatible primitive length ...
   ASSERT_ANY_THROW(PrimitiveNode::Make("uuid", Repetition::REQUIRED,
                                        UUIDLogicalType::Make(),
@@ -1942,6 +1959,17 @@ TEST_F(TestDecimalSchemaElementConstruction, DecimalCases) {
        true, check_DECIMAL},
       {"decimal", LogicalType::Decimal(11, 11), Type::INT64, -1, true,
        ConvertedType::DECIMAL, true, check_DECIMAL},
+      {"decimal", LogicalType::Decimal(9, 9), Type::INT32, -1, true,
+       ConvertedType::DECIMAL, true, check_DECIMAL},
+      {"decimal", LogicalType::Decimal(18, 18), Type::INT64, -1, true,
+       ConvertedType::DECIMAL, true, check_DECIMAL},
+      {"decimal", LogicalType::Decimal(307, 7), Type::FIXED_LEN_BYTE_ARRAY, 128, true,
+       ConvertedType::DECIMAL, true, check_DECIMAL},
+      {"decimal", LogicalType::Decimal(310, 32), Type::FIXED_LEN_BYTE_ARRAY, 129, true,
+       ConvertedType::DECIMAL, true, check_DECIMAL},
+      {"decimal", LogicalType::Decimal(2147483645, 2147483645),
+       Type::FIXED_LEN_BYTE_ARRAY, 891723282, true, ConvertedType::DECIMAL, true,
+       check_DECIMAL},
   };
 
   for (const SchemaElementConstructionArguments& c : cases) {
diff --git a/cpp/src/parquet/types.cc b/cpp/src/parquet/types.cc
index ef23c40662..349fc682aa 100644
--- a/cpp/src/parquet/types.cc
+++ b/cpp/src/parquet/types.cc
@@ -909,8 +909,14 @@ bool LogicalType::Impl::Decimal::is_applicable(parquet::Type::type primitive_typ
       }
     } break;
     case parquet::Type::FIXED_LEN_BYTE_ARRAY: {
+      // If the primitive length is larger than this we will overflow int32 when
+      // calculating precision.
+      if (primitive_length <= 0 || primitive_length > 891723282) {
+        ok = false;
+        break;
+      }
       ok = precision_ <= static_cast<int32_t>(std::floor(
-                             std::log10(std::pow(2.0, (8.0 * primitive_length) - 1.0))));
+                             std::log10(2) * ((8.0 * primitive_length) - 1.0)));
     } break;
     case parquet::Type::BYTE_ARRAY: {
       ok = true;