You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2020/05/13 01:15:26 UTC
[arrow] 10/17: PARQUET-1857: [C++] Do not fail to read unencrypted
files with over 32767 row groups. Change some DCHECKs causing segfaults to
throw exceptions
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch maint-0.17.x
in repository https://gitbox.apache.org/repos/asf/arrow.git
commit 5fb626dcf7c6cf0ab347f32c157e053c5c99fbc8
Author: Wes McKinney <we...@apache.org>
AuthorDate: Wed May 6 11:28:35 2020 -0500
PARQUET-1857: [C++] Do not fail to read unencrypted files with over 32767 row groups. Change some DCHECKs causing segfaults to throw exceptions
While writing files with so many row groups is a bad idea, people will still do it and currently the C++ library will segfault in release builds when trying to read such a file. This removes those potential segfaults and enables reading the many-row-group files again. Files with encrypted row group metadata with that many row groups cannot be read because the Parquet metadata uses an int16 row group ordinal key.
Closes #7108 from wesm/PARQUET-1857
Authored-by: Wes McKinney <we...@apache.org>
Signed-off-by: Wes McKinney <we...@apache.org>
---
cpp/src/parquet/file_reader.cc | 40 +++++++++++++++++++++++-------------
cpp/src/parquet/reader_test.cc | 11 ++++++++++
python/pyarrow/tests/test_parquet.py | 11 ++++++++++
3 files changed, 48 insertions(+), 14 deletions(-)
diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc
index 167321c..46c28a5 100644
--- a/cpp/src/parquet/file_reader.cc
+++ b/cpp/src/parquet/file_reader.cc
@@ -57,9 +57,12 @@ RowGroupReader::RowGroupReader(std::unique_ptr<Contents> contents)
: contents_(std::move(contents)) {}
std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
- DCHECK(i < metadata()->num_columns())
- << "The RowGroup only has " << metadata()->num_columns()
- << "columns, requested column: " << i;
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
const ColumnDescriptor* descr = metadata()->schema()->Column(i);
std::unique_ptr<PageReader> page_reader = contents_->GetColumnPageReader(i);
@@ -69,9 +72,12 @@ std::shared_ptr<ColumnReader> RowGroupReader::Column(int i) {
}
std::unique_ptr<PageReader> RowGroupReader::GetColumnPageReader(int i) {
- DCHECK(i < metadata()->num_columns())
- << "The RowGroup only has " << metadata()->num_columns()
- << "columns, requested column: " << i;
+ if (i >= metadata()->num_columns()) {
+ std::stringstream ss;
+ ss << "Trying to read column index " << i << " but row group metadata has only "
+ << metadata()->num_columns() << " columns";
+ throw ParquetException(ss.str());
+ }
return contents_->GetColumnPageReader(i);
}
@@ -136,6 +142,11 @@ class SerializedRowGroup : public RowGroupReader::Contents {
throw ParquetException("RowGroup is noted as encrypted but no file decryptor");
}
+ constexpr auto kEncryptedRowGroupsLimit = 32767;
+ if (i > kEncryptedRowGroupsLimit) {
+ throw ParquetException("Encrypted files cannot contain more than 32767 row groups");
+ }
+
// The column is encrypted
std::shared_ptr<Decryptor> meta_decryptor;
std::shared_ptr<Decryptor> data_decryptor;
@@ -170,7 +181,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
FileMetaData* file_metadata_;
std::unique_ptr<RowGroupMetaData> row_group_metadata_;
ReaderProperties properties_;
- int16_t row_group_ordinal_;
+ int row_group_ordinal_;
std::shared_ptr<InternalFileDecryptor> file_decryptor_;
};
@@ -200,9 +211,8 @@ class SerializedFile : public ParquetFileReader::Contents {
}
std::shared_ptr<RowGroupReader> GetRowGroup(int i) override {
- std::unique_ptr<SerializedRowGroup> contents(
- new SerializedRowGroup(source_, source_size_, file_metadata_.get(),
- static_cast<int16_t>(i), properties_, file_decryptor_));
+ std::unique_ptr<SerializedRowGroup> contents(new SerializedRowGroup(
+ source_, source_size_, file_metadata_.get(), i, properties_, file_decryptor_));
return std::make_shared<RowGroupReader>(std::move(contents));
}
@@ -529,10 +539,12 @@ std::shared_ptr<FileMetaData> ParquetFileReader::metadata() const {
}
std::shared_ptr<RowGroupReader> ParquetFileReader::RowGroup(int i) {
- DCHECK(i < metadata()->num_row_groups())
- << "The file only has " << metadata()->num_row_groups()
- << "row groups, requested reader for: " << i;
-
+ if (i >= metadata()->num_row_groups()) {
+ std::stringstream ss;
+ ss << "Trying to read row group " << i << " but file only has "
+ << metadata()->num_row_groups() << " row groups";
+ throw ParquetException(ss.str());
+ }
return contents_->GetRowGroup(i);
}
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index f28b226..a271075 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -105,6 +105,17 @@ TEST_F(TestAllTypesPlain, TestBatchRead) {
ASSERT_FALSE(col->HasNext());
}
+TEST_F(TestAllTypesPlain, RowGroupColumnBoundchecking) {
+ // Part of PARQUET-1857
+ ASSERT_THROW(reader_->RowGroup(reader_->metadata()->num_row_groups()),
+ ParquetException);
+
+ auto row_group = reader_->RowGroup(0);
+ ASSERT_THROW(row_group->Column(row_group->metadata()->num_columns()), ParquetException);
+ ASSERT_THROW(row_group->GetColumnPageReader(row_group->metadata()->num_columns()),
+ ParquetException);
+}
+
TEST_F(TestAllTypesPlain, TestFlatScannerInt32) {
std::shared_ptr<RowGroupReader> group = reader_->RowGroup(0);
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 86eb964..f215eaa 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -288,6 +288,17 @@ def test_special_chars_filename(tempdir, use_legacy_dataset):
assert table_read.equals(table)
+@pytest.mark.slow
+def test_file_with_over_int16_max_row_groups():
+# PARQUET-1857: Parquet encryption support introduced an INT16_MAX upper
+ # limit on the number of row groups, but this limit only impacts files with
+ # encrypted row group metadata because of the int16 row group ordinal used
+ # in the Parquet Thrift metadata. Unencrypted files are not impacted, so
+ # this test checks that it works (even if it isn't a good idea)
+ t = pa.table([list(range(40000))], names=['f0'])
+ _check_roundtrip(t, row_group_size=1)
+
+
@pytest.mark.pandas
@parametrize_legacy_dataset
def test_empty_table_roundtrip(use_legacy_dataset):