Posted to commits@arrow.apache.org by ma...@apache.org on 2024/01/27 07:02:19 UTC

(arrow) branch main updated: GH-39527: [C++][Parquet] Validate page sizes before truncating to int32 (#39528)

This is an automated email from the ASF dual-hosted git repository.

maplefu pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 5d7f66190e GH-39527: [C++][Parquet] Validate page sizes before truncating to int32 (#39528)
5d7f66190e is described below

commit 5d7f66190eec05f4fc0b9a1a17635ca6c68c828f
Author: emkornfield <em...@gmail.com>
AuthorDate: Fri Jan 26 23:02:12 2024 -0800

    GH-39527: [C++][Parquet] Validate page sizes before truncating to int32 (#39528)
    
    Be defensive instead of writing invalid data.
    
    ### Rationale for this change
    
    Users can provide this API with pages that are too large to be validly written, and we silently truncate their lengths before writing; a sketch of the hazard follows.
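    
    For illustration, a minimal standalone sketch (not part of this commit): narrowing an oversized `int64_t` size to `int32_t` wraps silently instead of failing (well defined since C++20, and the typical two's-complement behavior before that).
    
    ```cpp
    // Illustrative only: the kind of silent truncation the new checks prevent.
    #include <cstdint>
    #include <iostream>
    #include <limits>
    
    int main() {
      int64_t page_size =
          static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
      int32_t truncated = static_cast<int32_t>(page_size);  // wraps, no error
      std::cout << page_size << " -> " << truncated << std::endl;
      // Prints: 2147483648 -> -2147483648
      return 0;
    }
    ```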
    
    ### What changes are included in this PR?
    
    Add validations and throw an exception if sizes are too large (previously this was checked only when page indexes were being built); the pattern is sketched below.
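    
    As a hedged sketch of the pattern the added checks follow (the helper below is illustrative, not part of the Parquet API): compare the 64-bit size against INT32_MAX before narrowing, and throw rather than truncate.
    
    ```cpp
    // Hypothetical helper showing validate-before-narrow; not Parquet API.
    #include <cstdint>
    #include <limits>
    #include <stdexcept>
    #include <string>
    
    int32_t CheckedInt32Cast(int64_t size, const std::string& what) {
      if (size > std::numeric_limits<int32_t>::max()) {
        throw std::runtime_error(what + " size overflows INT32_MAX. Size: " +
                                 std::to_string(size));
      }
      return static_cast<int32_t>(size);  // safe: size fits in int32_t
    }
    ```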
    
    ### Are these changes tested?
    
    Unit tested
    
    ### Are there any user-facing changes?
    
    This might start raising exceptions instead of writing out invalid Parquet files.
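    
    A hypothetical caller-side sketch (the writer type and batch arguments are assumptions for illustration): callers writing very large pages may now want to catch `ParquetException` instead of relying on the old silent truncation.
    
    ```cpp
    // Hypothetical usage sketch: handle the exception the writer may now throw.
    #include <cstdint>
    #include <iostream>
    
    #include "parquet/column_writer.h"
    #include "parquet/exception.h"
    
    void WriteValues(parquet::Int64Writer* writer, int64_t num_values,
                     const int64_t* values) {
      try {
        writer->WriteBatch(num_values, /*def_levels=*/nullptr,
                           /*rep_levels=*/nullptr, values);
      } catch (const parquet::ParquetException& e) {
        // e.g. "Compressed data page size overflows INT32_MAX. Size: ..."
        std::cerr << "Parquet write failed: " << e.what() << std::endl;
      }
    }
    ```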
    
    Closes #39527
    
    **This PR contains a "Critical Fix".**
    * Closes: #39527
    
    Lead-authored-by: emkornfield <em...@gmail.com>
    Co-authored-by: Micah Kornfield <mi...@google.com>
    Co-authored-by: mwish <ma...@gmail.com>
    Co-authored-by: Antoine Pitrou <pi...@free.fr>
    Co-authored-by: Gang Wu <us...@gmail.com>
    Signed-off-by: mwish <ma...@gmail.com>
---
 cpp/src/parquet/column_writer.cc      | 29 ++++++++++++++++++----
 cpp/src/parquet/column_writer_test.cc | 45 +++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 12b2837fbf..23366b2daa 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -271,7 +271,12 @@ class SerializedPageWriter : public PageWriter {
   }
 
   int64_t WriteDictionaryPage(const DictionaryPage& page) override {
-    int64_t uncompressed_size = page.size();
+    int64_t uncompressed_size = page.buffer()->size();
+    if (uncompressed_size > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException(
+          "Uncompressed dictionary page size overflows INT32_MAX. Size: ",
+          uncompressed_size);
+    }
     std::shared_ptr<Buffer> compressed_data;
     if (has_compressor()) {
       auto buffer = std::static_pointer_cast<ResizableBuffer>(
@@ -288,6 +293,11 @@ class SerializedPageWriter : public PageWriter {
     dict_page_header.__set_is_sorted(page.is_sorted());
 
     const uint8_t* output_data_buffer = compressed_data->data();
+    if (compressed_data->size() > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException(
+          "Compressed dictionary page size overflows INT32_MAX. Size: ",
+          compressed_data->size());
+    }
     int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
 
     if (data_encryptor_.get()) {
@@ -371,18 +381,29 @@ class SerializedPageWriter : public PageWriter {
     const int64_t uncompressed_size = page.uncompressed_size();
     std::shared_ptr<Buffer> compressed_data = page.buffer();
     const uint8_t* output_data_buffer = compressed_data->data();
-    int32_t output_data_len = static_cast<int32_t>(compressed_data->size());
+    int64_t output_data_len = compressed_data->size();
+
+    if (output_data_len > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException("Compressed data page size overflows INT32_MAX. Size: ",
+                             output_data_len);
+    }
 
     if (data_encryptor_.get()) {
       PARQUET_THROW_NOT_OK(encryption_buffer_->Resize(
           data_encryptor_->CiphertextSizeDelta() + output_data_len, false));
       UpdateEncryption(encryption::kDataPage);
-      output_data_len = data_encryptor_->Encrypt(compressed_data->data(), output_data_len,
+      output_data_len = data_encryptor_->Encrypt(compressed_data->data(),
+                                                 static_cast<int32_t>(output_data_len),
                                                  encryption_buffer_->mutable_data());
       output_data_buffer = encryption_buffer_->data();
     }
 
     format::PageHeader page_header;
+
+    if (uncompressed_size > std::numeric_limits<int32_t>::max()) {
+      throw ParquetException("Uncompressed data page size overflows INT32_MAX. Size: ",
+                             uncompressed_size);
+    }
     page_header.__set_uncompressed_page_size(static_cast<int32_t>(uncompressed_size));
     page_header.__set_compressed_page_size(static_cast<int32_t>(output_data_len));
 
@@ -421,7 +442,7 @@ class SerializedPageWriter : public PageWriter {
     if (offset_index_builder_ != nullptr) {
       const int64_t compressed_size = output_data_len + header_size;
       if (compressed_size > std::numeric_limits<int32_t>::max()) {
-        throw ParquetException("Compressed page size overflows to INT32_MAX.");
+        throw ParquetException("Compressed page size overflows INT32_MAX.");
       }
       if (!page.first_row_index().has_value()) {
         throw ParquetException("First row index is not set in data page.");
diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc
index 59fc848d7f..97421629d2 100644
--- a/cpp/src/parquet/column_writer_test.cc
+++ b/cpp/src/parquet/column_writer_test.cc
@@ -15,9 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#include <memory>
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
 #include "arrow/io/buffered.h"
@@ -25,6 +27,7 @@
 #include "arrow/util/bit_util.h"
 #include "arrow/util/bitmap_builders.h"
 
+#include "parquet/column_page.h"
 #include "parquet/column_reader.h"
 #include "parquet/column_writer.h"
 #include "parquet/file_reader.h"
@@ -479,6 +482,9 @@ using TestValuesWriterInt64Type = TestPrimitiveWriter<Int64Type>;
 using TestByteArrayValuesWriter = TestPrimitiveWriter<ByteArrayType>;
 using TestFixedLengthByteArrayValuesWriter = TestPrimitiveWriter<FLBAType>;
 
+using ::testing::HasSubstr;
+using ::testing::ThrowsMessage;
+
 TYPED_TEST(TestPrimitiveWriter, RequiredPlain) {
   this->TestRequiredWithEncoding(Encoding::PLAIN);
 }
@@ -889,6 +895,45 @@ TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) {
   ASSERT_TRUE(this->metadata_is_stats_set());
 }
 
+TEST(TestPageWriter, ThrowsOnPagesTooLarge) {
+  NodePtr item = schema::Int32("item");  // optional item
+  NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, ConvertedType::LIST));
+  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));  // optional list
+  std::vector<NodePtr> fields = {bag};
+  NodePtr root = GroupNode::Make("schema", Repetition::REPEATED, fields);
+
+  SchemaDescriptor schema;
+  schema.Init(root);
+
+  auto sink = CreateOutputStream();
+  auto props = WriterProperties::Builder().build();
+
+  auto metadata = ColumnChunkMetaDataBuilder::Make(props, schema.Column(0));
+  std::unique_ptr<PageWriter> pager =
+      PageWriter::Open(sink, Compression::UNCOMPRESSED, metadata.get());
+
+  uint8_t data;
+  std::shared_ptr<Buffer> buffer =
+      std::make_shared<Buffer>(&data, std::numeric_limits<int32_t>::max() + int64_t{1});
+  DataPageV1 over_compressed_limit(buffer, /*num_values=*/100, Encoding::BIT_PACKED,
+                                   Encoding::BIT_PACKED, Encoding::BIT_PACKED,
+                                   /*uncompressed_size=*/100);
+  EXPECT_THAT([&]() { pager->WriteDataPage(over_compressed_limit); },
+              ThrowsMessage<ParquetException>(HasSubstr("overflows INT32_MAX")));
+  DictionaryPage dictionary_over_compressed_limit(buffer, /*num_values=*/100,
+                                                  Encoding::PLAIN);
+  EXPECT_THAT([&]() { pager->WriteDictionaryPage(dictionary_over_compressed_limit); },
+              ThrowsMessage<ParquetException>(HasSubstr("overflows INT32_MAX")));
+
+  buffer = std::make_shared<Buffer>(&data, 1);
+  DataPageV1 over_uncompressed_limit(
+      buffer, /*num_values=*/100, Encoding::BIT_PACKED, Encoding::BIT_PACKED,
+      Encoding::BIT_PACKED,
+      /*uncompressed_size=*/std::numeric_limits<int32_t>::max() + int64_t{1});
+  EXPECT_THAT([&]() { pager->WriteDataPage(over_uncompressed_limit); },
+              ThrowsMessage<ParquetException>(HasSubstr("overflows INT32_MAX")));
+}
+
 TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) {
   // In ARROW-3930 we discovered a bug when writing from Arrow when we had data
   // that looks like this: