You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by fs...@apache.org on 2019/07/12 01:23:37 UTC
[arrow] branch master updated: PARQUET-1623: [C++] Fix invalid
memory access encountered when reading some parquet files
This is an automated email from the ASF dual-hosted git repository.
fsaintjacques pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2a1e02b PARQUET-1623: [C++] Fix invalid memory access encountered when reading some parquet files
2a1e02b is described below
commit 2a1e02bf2075f7ef1adb322e88bd00e1d761dcf2
Author: Hatem Helal <hh...@mathworks.com>
AuthorDate: Thu Jul 11 21:23:18 2019 -0400
PARQUET-1623: [C++] Fix invalid memory access encountered when reading some parquet files
There are a number of conditions that are needed to observe this invalid memory access when reading from a parquet file:
* Must have a number of records equal to a power of two
* The column must be serialized across multiple data pages
Author: Hatem Helal <hh...@mathworks.com>
Closes #4857 from hatemhelal/parquet-1623 and squashes the following commits:
5cf75b5b4 <Hatem Helal> Add simplified unittest
d3c80073e <Hatem Helal> Initialize BitmapWriter with num_def_levels to prevent invalid memory access
---
cpp/src/parquet/column_reader-test.cc | 22 ++++++++++++++++++++++
cpp/src/parquet/column_reader.h | 2 +-
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git a/cpp/src/parquet/column_reader-test.cc b/cpp/src/parquet/column_reader-test.cc
index 46d099e..b6b928c 100644
--- a/cpp/src/parquet/column_reader-test.cc
+++ b/cpp/src/parquet/column_reader-test.cc
@@ -410,5 +410,27 @@ TEST(TestColumnReader, DefinitionLevelsToBitmap) {
ASSERT_EQ(current_byte, valid_bits[1]);
}
+TEST(TestColumnReader, DefinitionLevelsToBitmapPowerOfTwo) {
+ // PARQUET-1623: Invalid memory access when decoding a valid bits vector that has a
+ // length equal to a power of two and also using a non-zero valid_bits_offset. This
+ // should not fail when run with ASAN or valgrind.
+ std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3};
+ std::vector<int16_t> rep_levels = {0, 1, 1, 1, 1, 1, 1, 1};
+ std::vector<uint8_t> valid_bits(1, 0);
+
+ const int max_def_level = 3;
+ const int max_rep_level = 1;
+
+ int64_t values_read = -1;
+ int64_t null_count = 0;
+
+ // Read the latter half of the validity bitmap
+ internal::DefinitionLevelsToBitmap(def_levels.data() + 4, 4, max_def_level,
+ max_rep_level, &values_read, &null_count,
+ valid_bits.data(), 4 /* valid_bits_offset */);
+ ASSERT_EQ(4, values_read);
+ ASSERT_EQ(0, null_count);
+}
+
} // namespace test
} // namespace parquet
diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h
index e7d6afb..15987c5 100644
--- a/cpp/src/parquet/column_reader.h
+++ b/cpp/src/parquet/column_reader.h
@@ -186,7 +186,7 @@ static inline void DefinitionLevelsToBitmap(
// We assume here that valid_bits is large enough to accommodate the
// additional definition levels and the ones that have already been written
::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset,
- valid_bits_offset + num_def_levels);
+ num_def_levels);
// TODO(itaiin): As an interim solution we are splitting the code path here
// between repeated+flat column reads, and non-repeated+nested reads.