You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by fs...@apache.org on 2019/07/12 01:23:37 UTC

[arrow] branch master updated: PARQUET-1623: [C++] Fix invalid memory access encountered when reading some parquet files

This is an automated email from the ASF dual-hosted git repository.

fsaintjacques pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2a1e02b  PARQUET-1623: [C++] Fix invalid memory access encountered when reading some parquet files
2a1e02b is described below

commit 2a1e02bf2075f7ef1adb322e88bd00e1d761dcf2
Author: Hatem Helal <hh...@mathworks.com>
AuthorDate: Thu Jul 11 21:23:18 2019 -0400

    PARQUET-1623: [C++] Fix invalid memory access encountered when reading some parquet files
    
    There are a number of conditions that are needed to observe this invalid memory access when reading from a parquet file:
    
    * Must have a number of records equal to a power of two
    * The column must be serialized across multiple data pages
    
    Author: Hatem Helal <hh...@mathworks.com>
    
    Closes #4857 from hatemhelal/parquet-1623 and squashes the following commits:
    
    5cf75b5b4 <Hatem Helal> Add simplified unittest
    d3c80073e <Hatem Helal> Initialize BitmapWriter with num_def_levels to prevent invalid memory access
---
 cpp/src/parquet/column_reader-test.cc | 22 ++++++++++++++++++++++
 cpp/src/parquet/column_reader.h       |  2 +-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/cpp/src/parquet/column_reader-test.cc b/cpp/src/parquet/column_reader-test.cc
index 46d099e..b6b928c 100644
--- a/cpp/src/parquet/column_reader-test.cc
+++ b/cpp/src/parquet/column_reader-test.cc
@@ -410,5 +410,27 @@ TEST(TestColumnReader, DefinitionLevelsToBitmap) {
   ASSERT_EQ(current_byte, valid_bits[1]);
 }
 
+TEST(TestColumnReader, DefinitionLevelsToBitmapPowerOfTwo) {
+  // PARQUET-1623: Invalid memory access when decoding a valid bits vector that has a
+  // length equal to a power of two and also using a non-zero valid_bits_offset.  This
+  // should not fail when run with ASAN or valgrind.
+  std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3};
+  std::vector<int16_t> rep_levels = {0, 1, 1, 1, 1, 1, 1, 1};
+  std::vector<uint8_t> valid_bits(1, 0);
+
+  const int max_def_level = 3;
+  const int max_rep_level = 1;
+
+  int64_t values_read = -1;
+  int64_t null_count = 0;
+
+  // Read the latter half of the validity bitmap
+  internal::DefinitionLevelsToBitmap(def_levels.data() + 4, 4, max_def_level,
+                                     max_rep_level, &values_read, &null_count,
+                                     valid_bits.data(), 4 /* valid_bits_offset */);
+  ASSERT_EQ(4, values_read);
+  ASSERT_EQ(0, null_count);
+}
+
 }  // namespace test
 }  // namespace parquet
diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h
index e7d6afb..15987c5 100644
--- a/cpp/src/parquet/column_reader.h
+++ b/cpp/src/parquet/column_reader.h
@@ -186,7 +186,7 @@ static inline void DefinitionLevelsToBitmap(
   // We assume here that valid_bits is large enough to accommodate the
   // additional definition levels and the ones that have already been written
   ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset,
-                                                    valid_bits_offset + num_def_levels);
+                                                    num_def_levels);
 
   // TODO(itaiin): As an interim solution we are splitting the code path here
   // between repeated+flat column reads, and non-repeated+nested reads.