You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2020/05/13 01:15:26 UTC

[arrow] 10/17: PARQUET-1857: [C++] Do not fail to read unencrypted files with over 32767 row groups. Change some DCHECKs causing segfaults to throw exceptions

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 5fb626dcf7c6cf0ab347f32c157e053c5c99fbc8
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed May 6 11:28:35 2020 -0500

    PARQUET-1857: [C++] Do not fail to read unencrypted files with over 32767 row groups. Change some DCHECKs causing segfaults to throw exceptions
    
    While writing files with so many row groups is a bad idea, people will still do it and currently the C++ library will segfault in release builds when trying to read such a file. This removes those potential segfaults and enables reading the many-row-group files again. Files with encrypted row group metadata with that many row groups cannot be read because the Parquet metadata uses an int16 row group ordinal key.
    
    Closes #7108 from wesm/PARQUET-1857
    
    Authored-by: Wes McKinney <we...@apache.org>
    Signed-off-by: Wes McKinney <we...@apache.org>
---
 cpp/src/parquet/file_reader.cc       | 40 +++++++++++++++++++++++-------------
 cpp/src/parquet/reader_test.cc       | 11 ++++++++++
 python/pyarrow/tests/test_parquet.py | 11 ++++++++++
 3 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 167321c..46c28a5 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -57,9 +57,12 @@ RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
     : contents_(std::move(contents)) {}
 
 std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
-  DCHECK(i < metadata()->num_columns())
-      << "The RowGroup only has " << metadata()->num_columns()
-      << "columns, requested column: " << i;
+  if (i >= metadata()->num_columns()) {
+    std::stringstream ss;
+    ss << "Trying to read column index " << i << " but row group metadata has only "
+       << metadata()->num_columns() << " columns";
+    throw ParquetException(ss.str());
+  }
   const ColumnDescriptor* descr = metadata()->schema()->Column(i);
 
   std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
@@ -69,9 +72,12 @@ std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
 }
 
 std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
-  DCHECK(i < metadata()->num_columns())
-      << "The RowGroup only has " << metadata()->num_columns()
-      << "columns, requested column: " << i;
+  if (i >= metadata()->num_columns()) {
+    std::stringstream ss;
+    ss << "Trying to read column index " << i << " but row group metadata has only "
+       << metadata()->num_columns() << " columns";
+    throw ParquetException(ss.str());
+  }
   return contents_->GetColumnPageReader(i);
 }
 
@@ -136,6 +142,11 @@ class SerializedRowGroup : public RowGroupReader::Contents {
       throw ParquetException("RowGroup is noted as encrypted but no file decryptor");
     }
 
+    constexpr auto kEncryptedRowGroupsLimit = 32767;
+    if (i > kEncryptedRowGroupsLimit) {
+      throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
+    }
+
     // The column is encrypted
     std::shared_ptr<Decryptor> meta_decryptor;
     std::shared_ptr<Decryptor> data_decryptor;
@@ -170,7 +181,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
   FileMetaData* file_metadata_;
   std::unique_ptr<RowGroupMetaData> row_group_metadata_;
   ReaderProperties properties_;
-  int16_t row_group_ordinal_;
+  int row_group_ordinal_;
   std::shared_ptr<InternalFileDecryptor> file_decryptor_;
 };
 
@@ -200,9 +211,8 @@ class SerializedFile : public ParquetFileReader::Contents {
   }
 
   std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
-    std::unique_ptr<SerializedRowGroup> contents(
-        new SerializedRowGroup(source_, source_size_, file_metadata_.get(),
-                               static_cast<int16_t>(i), properties_, file_decryptor_));
+    std::unique_ptr<SerializedRowGroup> contents(new SerializedRowGroup(
+        source_, source_size_, file_metadata_.get(), i, properties_, file_decryptor_));
     return std::make_shared<RowGroupReader>(std::move(contents));
   }
 
@@ -529,10 +539,12 @@ std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
 }
 
 std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
-  DCHECK(i < metadata()->num_row_groups())
-      << "The file only has " << metadata()->num_row_groups()
-      << "row groups, requested reader for: " << i;
-
+  if (i >= metadata()->num_row_groups()) {
+    std::stringstream ss;
+    ss << "Trying to read row group " << i << " but file only has "
+       << metadata()->num_row_groups() << " row groups";
+    throw ParquetException(ss.str());
+  }
   return contents_->GetRowGroup(i);
 }
 
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index f28b226..a271075 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -105,6 +105,17 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
   ASSERT_FALSE(col->HasNext());
 }
 
+TEST_F(TestAllTypesPlain, RowGroupColumnBoundchecking) {
+  // Part of PARQUET-1857
+  ASSERT_THROW(reader_->RowGroup(reader_->metadata()->num_row_groups()),
+               ParquetException);
+
+  auto row_group = reader_->RowGroup(0);
+  ASSERT_THROW(row_group->Column(row_group->metadata()->num_columns()), ParquetException);
+  ASSERT_THROW(row_group->GetColumnPageReader(row_group->metadata()->num_columns()),
+               ParquetException);
+}
+
 TEST_F(TestAllTypesPlain, TestFlatScannerInt32) {
   std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
 
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 86eb964..f215eaa 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -288,6 +288,17 @@ def test_special_chars_filename(tempdir, use_legacy_dataset):
     assert table_read.equals(table)
 
 
+@pytest.mark.slow
+def test_file_with_over_int16_max_row_groups():
+    # PARQUET-1857: Parquet encryption support introduced a INT16_MAX upper
+    # limit on the number of row groups, but this limit only impacts files with
+    # encrypted row group metadata because of the int16 row group ordinal used
+    # in the Parquet Thrift metadata. Unencrypted files are not impacted, so
+    # this test checks that it works (even if it isn't a good idea)
+    t = pa.table([list(range(40000))], names=['f0'])
+    _check_roundtrip(t, row_group_size=1)
+
+
 @pytest.mark.pandas
 @parametrize_legacy_dataset
 def test_empty_table_roundtrip(use_legacy_dataset):