You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/18 17:35:36 UTC
[arrow] branch master updated: ARROW-5608: [C++][parquet] Fix
invalid memory access when using parquet::arrow::ColumnReader
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6f096a9 ARROW-5608: [C++][parquet] Fix invalid memory access when using parquet::arrow::ColumnReader
6f096a9 is described below
commit 6f096a95761d97e453d05ef56dd61a44a79fc266
Author: Hatem Helal <hh...@mathworks.com>
AuthorDate: Tue Jun 18 12:35:28 2019 -0500
ARROW-5608: [C++][parquet] Fix invalid memory access when using parquet::arrow::ColumnReader
This fix appears to resolve the ASAN failures detected by this [gist](https://gist.github.com/hatemhelal/892e76a48e5b372f0e28a34403893ddd#file-reader-writer-cc-L130).
This invalid memory access is only observed when:
* The `batch_size` is smaller than the row group
* There are zero repetition levels.
Included with this patch is a type-parameterized unit test (might be overkill) that I've verified on macOS.
Author: Hatem Helal <hh...@mathworks.com>
Closes #4574 from hatemhelal/arrow-5608 and squashes the following commits:
0b37baac7 <Hatem Helal> Add unittest for invalid memory access when calling ColumnReader::NextBatch
5566a4abc <Hatem Helal> clang-format changes
491db7e65 <Hatem Helal> only resize rep_levels buffer for nodes with non-zero repetition level
---
cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 34 +++++++++++++++++++++++
cpp/src/parquet/arrow/record_reader.cc | 10 ++++---
2 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 21f6f04..5425e7e 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -856,6 +856,40 @@ TYPED_TEST(TestParquetIO, FileMetaDataWrite) {
ASSERT_EQ(metadata->RowGroup(0)->num_rows(), metadata_written->RowGroup(0)->num_rows());
}
+TYPED_TEST(TestParquetIO, CheckIterativeColumnRead) {
+ // ARROW-5608: Test using ColumnReader with small batch size (1) and non-repeated
+ // nullable fields with ASAN.
+ std::shared_ptr<Array> values;
+ ASSERT_OK(NonNullArray<TypeParam>(SMALL_SIZE, &values));
+ std::shared_ptr<Table> table = MakeSimpleTable(values, true);
+ this->ResetSink();
+ ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+ values->length(), default_writer_properties()));
+
+ std::unique_ptr<FileReader> reader;
+ this->ReaderFromSink(&reader);
+ std::unique_ptr<ColumnReader> column_reader;
+ ASSERT_OK_NO_THROW(reader->GetColumn(0, &column_reader));
+ ASSERT_NE(nullptr, column_reader.get());
+
+ // Read one record at a time.
+ std::vector<std::shared_ptr<::arrow::Array>> batches;
+
+ for (int64_t i = 0; i < values->length(); ++i) {
+ std::shared_ptr<::arrow::ChunkedArray> batch;
+ ASSERT_OK_NO_THROW(column_reader->NextBatch(1, &batch));
+ ASSERT_EQ(1, batch->length());
+ ASSERT_EQ(1, batch->num_chunks());
+ batches.push_back(batch->chunk(0));
+ }
+
+ auto chunked = std::make_shared<::arrow::ChunkedArray>(batches);
+ auto chunked_col =
+ std::make_shared<::arrow::Column>(table->schema()->field(0), chunked);
+ auto chunked_table = ::arrow::Table::Make(table->schema(), {chunked_col});
+ ASSERT_TRUE(table->Equals(*chunked_table));
+}
+
using TestInt96ParquetIO = TestParquetIO<::arrow::TimestampType>;
TEST_F(TestInt96ParquetIO, ReadIntoTimestamp) {
diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc
index e6f20a5..b7f9d90 100644
--- a/cpp/src/parquet/arrow/record_reader.cc
+++ b/cpp/src/parquet/arrow/record_reader.cc
@@ -353,12 +353,14 @@ class RecordReader::RecordReaderImpl {
int16_t* rep_data = rep_levels();
std::copy(def_data + levels_position_, def_data + levels_written_, def_data);
- std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
-
PARQUET_THROW_NOT_OK(
def_levels_->Resize(levels_remaining * sizeof(int16_t), false));
- PARQUET_THROW_NOT_OK(
- rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+
+ if (max_rep_level_ > 0) {
+ std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
+ PARQUET_THROW_NOT_OK(
+ rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+ }
levels_written_ -= levels_position_;
levels_position_ = 0;