You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/12/06 20:26:14 UTC
[arrow] branch master updated: PARQUET-1471: [C++] TypedStatistics::UpdateSpaced reads out of bounds value when there are more definition levels than spaced values

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b731b58  PARQUET-1471: [C++] TypedStatistics<T>::UpdateSpaced reads out of bounds value when there are more definition levels than spaced values
b731b58 is described below

commit b731b583e870726121d0adeebad014981e05b36c
Author: Wes McKinney <we...@apache.org>
AuthorDate: Thu Dec 6 14:26:05 2018 -0600

    PARQUET-1471: [C++] TypedStatistics<T>::UpdateSpaced reads out of bounds value when there are more definition levels than spaced values
    
    This bug arose in https://github.com/apache/arrow/pull/3074 as a result of random data changing
    
    Author: Wes McKinney <we...@apache.org>
    
    Closes #3114 from wesm/PARQUET-1471 and squashes the following commits:
    
    4108d8e60 <Wes McKinney> MSVC was including src/parquet/arrow/test-util.h instead of src/arrow/test-util.h
    755c9905f <Wes McKinney> Add missing include guard to parquet/arrow/test-util.h
    8410efa6a <Wes McKinney> Unit test reproducing valgrind error on optional list
    6632f7cd2 <Wes McKinney> Fix out of bounds access when updating column statistics when max_definition_level > 1
---
 cpp/src/arrow/test-util.h                         |  5 +--
 cpp/src/parquet/arrow/arrow-reader-writer-test.cc | 12 +++---
 cpp/src/parquet/arrow/test-util.h                 | 20 ++++-----
 cpp/src/parquet/arrow/writer.cc                   |  2 +-
 cpp/src/parquet/column_writer-test.cc             | 49 +++++++++++++++++++++++
 cpp/src/parquet/column_writer.cc                  | 20 ++++-----
 cpp/src/parquet/column_writer.h                   |  5 +--
 cpp/src/parquet/test-specialization.h             |  5 +--
 cpp/src/parquet/test-util.h                       |  5 +--
 9 files changed, 76 insertions(+), 47 deletions(-)

diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index 3011f28..663817b 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -15,8 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef ARROW_TEST_UTIL_H_
-#define ARROW_TEST_UTIL_H_
+#pragma once
 
 #ifndef _WIN32
 #include <sys/stat.h>
@@ -409,5 +408,3 @@ class BatchIterator : public RecordBatchReader {
 };
 
 }  // namespace arrow
-
-#endif  // ARROW_TEST_UTIL_H_
diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
index b8eb094..24ec0dd 100644
--- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -29,6 +29,11 @@
 #include <sstream>
 #include <vector>
 
+#include "arrow/api.h"
+#include "arrow/test-util.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/decimal.h"
+
 #include "parquet/api/reader.h"
 #include "parquet/api/writer.h"
 
@@ -36,16 +41,9 @@
 #include "parquet/arrow/schema.h"
 #include "parquet/arrow/test-util.h"
 #include "parquet/arrow/writer.h"
-
 #include "parquet/file_writer.h"
-
 #include "parquet/util/test-common.h"
 
-#include "arrow/api.h"
-#include "arrow/test-util.h"
-#include "arrow/type_traits.h"
-#include "arrow/util/decimal.h"
-
 using arrow::Array;
 using arrow::ArrayVisitor;
 using arrow::Buffer;
diff --git a/cpp/src/parquet/arrow/test-util.h b/cpp/src/parquet/arrow/test-util.h
index 097e369..1e9390b 100644
--- a/cpp/src/parquet/arrow/test-util.h
+++ b/cpp/src/parquet/arrow/test-util.h
@@ -15,8 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
+#pragma once
+
 #include <limits>
 #include <memory>
+#include <random>
 #include <string>
 #include <utility>
 #include <vector>
@@ -28,14 +31,6 @@
 
 #include "parquet/arrow/record_reader.h"
 
-namespace arrow {
-// PARQUET-1382: backwards-compatible shim for arrow::test namespace
-namespace test {}
-
-using namespace ::arrow::test;  // NOLINT
-
-}  // namespace arrow
-
 namespace parquet {
 
 using internal::RecordReader;
@@ -146,7 +141,7 @@ static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision,
                                    uint8_t* out) {
   std::mt19937 gen(seed);
   std::uniform_int_distribution<uint32_t> d(0, std::numeric_limits<uint8_t>::max());
-  const int32_t required_bytes = DecimalSize(precision);
+  const int32_t required_bytes = ::arrow::DecimalSize(precision);
   constexpr int32_t byte_width = 16;
   std::fill(out, out + byte_width * n, '\0');
 
@@ -433,14 +428,13 @@ Status MakeEmptyListsArray(int64_t size, std::shared_ptr<Array>* out_array) {
   return Status::OK();
 }
 
-static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name,
-                                                   const std::shared_ptr<Array>& array,
-                                                   bool nullable) {
+static inline std::shared_ptr<::arrow::Column> MakeColumn(
+    const std::string& name, const std::shared_ptr<Array>& array, bool nullable) {
   auto field = ::arrow::field(name, array->type(), nullable);
   return std::make_shared<::arrow::Column>(field, array);
 }
 
-static std::shared_ptr<::arrow::Column> MakeColumn(
+static inline std::shared_ptr<::arrow::Column> MakeColumn(
     const std::string& name, const std::vector<std::shared_ptr<Array>>& arrays,
     bool nullable) {
   auto field = ::arrow::field(name, arrays[0]->type(), nullable);
diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index f5e234d..ef5de07 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -504,7 +504,7 @@ Status ArrowColumnWriter::WriteNullableBatch(
   using ParquetCType = typename ParquetType::c_type;
 
   ParquetCType* buffer;
-  RETURN_NOT_OK(ctx_->GetScratchData<ParquetCType>(num_levels, &buffer));
+  RETURN_NOT_OK(ctx_->GetScratchData<ParquetCType>(num_values, &buffer));
   for (int i = 0; i < num_values; i++) {
     buffer[i] = static_cast<ParquetCType>(values[i]);
   }
diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc
index b81f3ed..4416e3d 100644
--- a/cpp/src/parquet/column_writer-test.cc
+++ b/cpp/src/parquet/column_writer-test.cc
@@ -17,6 +17,8 @@
 
 #include <gtest/gtest.h>
 
+#include <arrow/test-util.h>
+
 #include "parquet/column_reader.h"
 #include "parquet/column_writer.h"
 #include "parquet/test-specialization.h"
@@ -28,6 +30,7 @@
 
 namespace parquet {
 
+using schema::GroupNode;
 using schema::NodePtr;
 using schema::PrimitiveNode;
 
@@ -581,6 +584,52 @@ TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) {
   ASSERT_TRUE(this->metadata_is_stats_set());
 }
 
+TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) {
+  // In ARROW-3930 we discovered a bug when writing from Arrow when we had data
+  // that looks like this:
+  //
+  // [null, [0, 1, null, 2, 3, 4, null]]
+
+  // Create schema
+  NodePtr item = schema::Int32("item");  // optional item
+  NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST));
+  NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));  // optional list
+  std::vector<NodePtr> fields = {bag};
+  NodePtr root = GroupNode::Make("schema", Repetition::REPEATED, fields);
+
+  SchemaDescriptor schema;
+  schema.Init(root);
+
+  InMemoryOutputStream sink;
+  auto props = WriterProperties::Builder().build();
+
+  auto metadata = ColumnChunkMetaDataBuilder::Make(props, schema.Column(0));
+  std::unique_ptr<PageWriter> pager =
+      PageWriter::Open(&sink, Compression::UNCOMPRESSED, metadata.get());
+  std::shared_ptr<ColumnWriter> writer =
+      ColumnWriter::Make(metadata.get(), std::move(pager), props.get());
+  auto typed_writer = std::static_pointer_cast<TypedColumnWriter<Int32Type>>(writer);
+
+  std::vector<int16_t> def_levels = {1, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3};
+  std::vector<int16_t> rep_levels = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
+
+  // Write the values into uninitialized memory
+  std::shared_ptr<Buffer> values_buffer;
+  ASSERT_OK(::arrow::AllocateBuffer(64, &values_buffer));
+  memcpy(values_buffer->mutable_data(), values.data(), 13 * sizeof(int32_t));
+  auto values_data = reinterpret_cast<const int32_t*>(values_buffer->data());
+
+  std::shared_ptr<Buffer> valid_bits;
+  ASSERT_OK(::arrow::BitUtil::BytesToBits({1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1},
+                                          ::arrow::default_memory_pool(), &valid_bits));
+
+  // valgrind will warn about out of bounds access into def_levels_data
+  typed_writer->WriteBatchSpaced(14, def_levels.data(), rep_levels.data(),
+                                 valid_bits->data(), 0, values_data);
+  writer->Close();
+}
+
 void GenerateLevels(int min_repeat_factor, int max_repeat_factor, int max_level,
                     std::vector<int16_t>& input_levels) {
   // for each repetition count upto max_repeat_factor
diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc
index 857673d..37fce9c 100644
--- a/cpp/src/parquet/column_writer.cc
+++ b/cpp/src/parquet/column_writer.cc
@@ -717,7 +717,7 @@ inline int64_t TypedColumnWriter<DType>::WriteMiniBatch(int64_t num_values,
 
 template <typename DType>
 inline int64_t TypedColumnWriter<DType>::WriteMiniBatchSpaced(
-    int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels,
+    int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels,
     const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values,
     int64_t* num_spaced_written) {
   int64_t values_to_write = 0;
@@ -729,7 +729,7 @@ inline int64_t TypedColumnWriter<DType>::WriteMiniBatchSpaced(
     if (descr_->schema_node()->is_optional()) {
       min_spaced_def_level--;
     }
-    for (int64_t i = 0; i < num_values; ++i) {
+    for (int64_t i = 0; i < num_levels; ++i) {
       if (def_levels[i] == descr_->max_definition_level()) {
         ++values_to_write;
       }
@@ -738,27 +738,27 @@ inline int64_t TypedColumnWriter<DType>::WriteMiniBatchSpaced(
       }
     }
 
-    WriteDefinitionLevels(num_values, def_levels);
+    WriteDefinitionLevels(num_levels, def_levels);
   } else {
     // Required field, write all values
-    values_to_write = num_values;
-    spaced_values_to_write = num_values;
+    values_to_write = num_levels;
+    spaced_values_to_write = num_levels;
   }
 
   // Not present for non-repeated fields
   if (descr_->max_repetition_level() > 0) {
     // A row could include more than one value
     // Count the occasions where we start a new row
-    for (int64_t i = 0; i < num_values; ++i) {
+    for (int64_t i = 0; i < num_levels; ++i) {
       if (rep_levels[i] == 0) {
         rows_written_++;
       }
     }
 
-    WriteRepetitionLevels(num_values, rep_levels);
+    WriteRepetitionLevels(num_levels, rep_levels);
   } else {
     // Each value is exactly one row
-    rows_written_ += static_cast<int>(num_values);
+    rows_written_ += static_cast<int>(num_levels);
   }
 
   if (descr_->schema_node()->is_optional()) {
@@ -770,10 +770,10 @@ inline int64_t TypedColumnWriter<DType>::WriteMiniBatchSpaced(
 
   if (page_statistics_ != nullptr) {
     page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, values_to_write,
-                                   num_values - values_to_write);
+                                   spaced_values_to_write - values_to_write);
   }
 
-  num_buffered_values_ += num_values;
+  num_buffered_values_ += num_levels;
   num_buffered_encoded_values_ += values_to_write;
 
   if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h
index 3c69dd3..e665ca7 100644
--- a/cpp/src/parquet/column_writer.h
+++ b/cpp/src/parquet/column_writer.h
@@ -15,8 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#ifndef PARQUET_COLUMN_WRITER_H
-#define PARQUET_COLUMN_WRITER_H
+#pragma once
 
 #include <memory>
 #include <vector>
@@ -330,5 +329,3 @@ PARQUET_EXTERN_TEMPLATE TypedColumnWriter<ByteArrayType>;
 PARQUET_EXTERN_TEMPLATE TypedColumnWriter<FLBAType>;
 
 }  // namespace parquet
-
-#endif  // PARQUET_COLUMN_READER_H
diff --git a/cpp/src/parquet/test-specialization.h b/cpp/src/parquet/test-specialization.h
index 3d88cfc..55d2374 100644
--- a/cpp/src/parquet/test-specialization.h
+++ b/cpp/src/parquet/test-specialization.h
@@ -19,8 +19,7 @@
 // Parquet column chunk within a row group. It could be extended in the future
 // to iterate through all data pages in all chunks in a file.
 
-#ifndef PARQUET_COLUMN_TEST_SPECIALIZATION_H
-#define PARQUET_COLUMN_TEST_SPECIALIZATION_H
+#pragma once
 
 #include <algorithm>
 #include <limits>
@@ -179,5 +178,3 @@ void PrimitiveTypedTest<BooleanType>::GenerateData(int64_t num_values) {
 }  // namespace test
 
 }  // namespace parquet
-
-#endif  // PARQUET_COLUMN_TEST_SPECIALIZATION_H
diff --git a/cpp/src/parquet/test-util.h b/cpp/src/parquet/test-util.h
index 92aa8d3..ab9c50a 100644
--- a/cpp/src/parquet/test-util.h
+++ b/cpp/src/parquet/test-util.h
@@ -19,8 +19,7 @@
 // Parquet column chunk within a row group. It could be extended in the future
 // to iterate through all data pages in all chunks in a file.
 
-#ifndef PARQUET_COLUMN_TEST_UTIL_H
-#define PARQUET_COLUMN_TEST_UTIL_H
+#pragma once
 
 #include <algorithm>
 #include <limits>
@@ -442,5 +441,3 @@ static int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_pa
 }  // namespace test
 
 }  // namespace parquet
-
-#endif  // PARQUET_COLUMN_TEST_UTIL_H