Posted to commits@arrow.apache.org by we...@apache.org on 2019/06/18 17:35:36 UTC

[arrow] branch master updated: ARROW-5608: [C++][parquet] Fix invalid memory access when using parquet::arrow::ColumnReader

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6f096a9  ARROW-5608: [C++][parquet] Fix invalid memory access when using parquet::arrow::ColumnReader
6f096a9 is described below

commit 6f096a95761d97e453d05ef56dd61a44a79fc266
Author: Hatem Helal <hh...@mathworks.com>
AuthorDate: Tue Jun 18 12:35:28 2019 -0500

    ARROW-5608: [C++][parquet] Fix invalid memory access when using parquet::arrow::ColumnReader
    
    This fix appears to resolve the ASAN failures detected by this [gist](https://gist.github.com/hatemhelal/892e76a48e5b372f0e28a34403893ddd#file-reader-writer-cc-L130).
    
    The invalid memory access is observed only when both of the following hold:
    
    * The `batch_size` is smaller than the number of rows in the row group
    * The column has zero repetition levels (i.e. a non-repeated field).
    
    Included with this patch is a type-parameterized unit test (might be overkill) that I've verified on macOS.
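    
    For context, here is a minimal sketch of the read pattern that reproduces the
    failure, adapted from the linked gist and the unit test below. The function
    name and `num_rows` parameter are illustrative, not part of the library:
    
    ```cpp
    #include <memory>
    
    #include <arrow/api.h>
    #include <arrow/io/api.h>
    #include <parquet/arrow/reader.h>
    
    // Read column 0 one record at a time from an already-opened Parquet file.
    ::arrow::Status ReadOneRecordAtATime(
        const std::shared_ptr<::arrow::io::RandomAccessFile>& infile, int64_t num_rows) {
      std::unique_ptr<parquet::arrow::FileReader> reader;
      ARROW_RETURN_NOT_OK(
          parquet::arrow::OpenFile(infile, ::arrow::default_memory_pool(), &reader));
    
      std::unique_ptr<parquet::arrow::ColumnReader> column_reader;
      ARROW_RETURN_NOT_OK(reader->GetColumn(0, &column_reader));
    
      // batch_size = 1 is smaller than the row group, and the column is a
      // plain non-repeated (max repetition level == 0) nullable field.
      for (int64_t i = 0; i < num_rows; ++i) {
        std::shared_ptr<::arrow::ChunkedArray> batch;
        ARROW_RETURN_NOT_OK(column_reader->NextBatch(1, &batch));
        // Before this fix, later iterations could read past the end of the
        // never-allocated repetition level buffer.
      }
      return ::arrow::Status::OK();
    }
    ```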
    
    Author: Hatem Helal <hh...@mathworks.com>
    
    Closes #4574 from hatemhelal/arrow-5608 and squashes the following commits:
    
    0b37baac7 <Hatem Helal> Add unittest for invalid memory access when calling ColumnReader::NextBatch
    5566a4abc <Hatem Helal> clang-format changes
    491db7e65 <Hatem Helal> only resize rep_levels buffer for nodes with non-zero repetition level
---
 cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 34 +++++++++++++++++++++++
 cpp/src/parquet/arrow/record_reader.cc            | 10 ++++---
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index 21f6f04..5425e7e 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -856,6 +856,40 @@ TYPED_TEST(TestParquetIO, FileMetaDataWrite) {
   ASSERT_EQ(metadata->RowGroup(0)->num_rows(), metadata_written->RowGroup(0)->num_rows());
 }
 
+TYPED_TEST(TestParquetIO, CheckIterativeColumnRead) {
+  // ARROW-5608: Test using ColumnReader with small batch size (1) and non-repeated
+  // nullable fields with ASAN.
+  std::shared_ptr<Array> values;
+  ASSERT_OK(NonNullArray<TypeParam>(SMALL_SIZE, &values));
+  std::shared_ptr<Table> table = MakeSimpleTable(values, true);
+  this->ResetSink();
+  ASSERT_OK_NO_THROW(WriteTable(*table, ::arrow::default_memory_pool(), this->sink_,
+                                values->length(), default_writer_properties()));
+
+  std::unique_ptr<FileReader> reader;
+  this->ReaderFromSink(&reader);
+  std::unique_ptr<ColumnReader> column_reader;
+  ASSERT_OK_NO_THROW(reader->GetColumn(0, &column_reader));
+  ASSERT_NE(nullptr, column_reader.get());
+
+  // Read one record at a time.
+  std::vector<std::shared_ptr<::arrow::Array>> batches;
+
+  for (int64_t i = 0; i < values->length(); ++i) {
+    std::shared_ptr<::arrow::ChunkedArray> batch;
+    ASSERT_OK_NO_THROW(column_reader->NextBatch(1, &batch));
+    ASSERT_EQ(1, batch->length());
+    ASSERT_EQ(1, batch->num_chunks());
+    batches.push_back(batch->chunk(0));
+  }
+
+  auto chunked = std::make_shared<::arrow::ChunkedArray>(batches);
+  auto chunked_col =
+      std::make_shared<::arrow::Column>(table->schema()->field(0), chunked);
+  auto chunked_table = ::arrow::Table::Make(table->schema(), {chunked_col});
+  ASSERT_TRUE(table->Equals(*chunked_table));
+}
+
 using TestInt96ParquetIO = TestParquetIO<::arrow::TimestampType>;
 
 TEST_F(TestInt96ParquetIO, ReadIntoTimestamp) {
diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc
index e6f20a5..b7f9d90 100644
--- a/cpp/src/parquet/arrow/record_reader.cc
+++ b/cpp/src/parquet/arrow/record_reader.cc
@@ -353,12 +353,14 @@ class RecordReader::RecordReaderImpl {
       int16_t* rep_data = rep_levels();
 
       std::copy(def_data + levels_position_, def_data + levels_written_, def_data);
-      std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
-
       PARQUET_THROW_NOT_OK(
           def_levels_->Resize(levels_remaining * sizeof(int16_t), false));
-      PARQUET_THROW_NOT_OK(
-          rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+
+      if (max_rep_level_ > 0) {
+        std::copy(rep_data + levels_position_, rep_data + levels_written_, rep_data);
+        PARQUET_THROW_NOT_OK(
+            rep_levels_->Resize(levels_remaining * sizeof(int16_t), false));
+      }
 
       levels_written_ -= levels_position_;
       levels_position_ = 0;
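
The hunk above makes the repetition-level copy and resize conditional on
`max_rep_level_ > 0`. A simplified sketch of the same idea, using stand-in
names rather than the actual RecordReader members:

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Simplified stand-ins for the level buffers (not the actual RecordReader
    // members). For a non-repeated column (max_rep_level == 0) the repetition
    // level buffer is never allocated, so copying from it reads invalid memory.
    struct LevelBuffers {
      int16_t max_rep_level = 0;
      std::vector<int16_t> def_levels;
      std::vector<int16_t> rep_levels;  // stays empty when max_rep_level == 0

      // Discard the first `position` consumed levels, keeping the remainder.
      void DiscardConsumed(int64_t position, int64_t written) {
        const int64_t remaining = written - position;
        std::copy(def_levels.begin() + position, def_levels.begin() + written,
                  def_levels.begin());
        def_levels.resize(remaining);
        if (max_rep_level > 0) {
          // Before ARROW-5608 this copy/resize ran unconditionally, walking
          // past the end of the empty rep_levels buffer for non-repeated
          // columns.
          std::copy(rep_levels.begin() + position, rep_levels.begin() + written,
                    rep_levels.begin());
          rep_levels.resize(remaining);
        }
      }
    };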