You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2018/04/18 10:55:31 UTC

[parquet-cpp] branch master updated: PARQUET-1274: Prevent segfault that was occurring when writing a nanosecond timestamp with arrow writer properties set to coerce timestamps and support deprecated int96 timestamps.

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-cpp.git


The following commit(s) were added to refs/heads/master by this push:
     new 0c1f5c5  PARQUET-1274: Prevent segfault that was occurring when writing a nanosecond timestamp with arrow writer properties set to coerce timestamps and support deprecated int96 timestamps.
0c1f5c5 is described below

commit 0c1f5c51f0af08509dbc229339310c2915b1df18
Author: Joshua Storck <jo...@twosigma.com>
AuthorDate: Wed Apr 18 12:55:23 2018 +0200

    PARQUET-1274: Prevent segfault that was occurring when writing a nanosecond timestamp with arrow writer properties set to coerce timestamps and support deprecated int96 timestamps.
    
    The bug was a due to the fact that the physical type was int64 but the WriteTimestamps function was taking a path that assumed the physical type was int96. This caused memory corruption because it was writing past the end of the array. The bug was fixed by checking that coerce timestamps is disabled when writing int96.
    
    A unit test was added for the regression.
    
    Author: Joshua Storck <jo...@twosigma.com>
    
    Closes #456 from joshuastorck/ARROW_2082 and squashes the following commits:
    
    5fa0a94 [Joshua Storck] Removing 'using ::arrow' in favor of using ::arrow::SomeType
    9725ecc [Joshua Storck] Bug fix for ARROW-2082, in which a segfault was being encountered when writing a nanosecond timestamp column with arrow writer properties set to coerce timestamps and support deprecated int96 timestamps. The bug was a segfault due to the fact that the physical type was int64 but the WriteTimestamps function was taking a path that assumed the physical type was int96. The bug was fixed by checking that coerce timestamps is disabled when writing int96. A unit test [...]
---
 src/parquet/arrow/arrow-reader-writer-test.cc | 58 +++++++++++++++++++++++++++
 src/parquet/arrow/writer.cc                   |  9 ++++-
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 93ecd3c..cb38b8f 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -1436,6 +1436,64 @@ TEST(TestArrowReadWrite, ConvertedDateTimeTypes) {
   AssertTablesEqual(*ex_table, *result);
 }
 
+// Regression for ARROW-2802
+TEST(TestArrowReadWrite, CoerceTimestampsAndSupportDeprecatedInt96) {
+  using ::arrow::Column;
+  using ::arrow::Field;
+  using ::arrow::Schema;
+  using ::arrow::Table;
+  using ::arrow::TimeUnit;
+  using ::arrow::TimestampType;
+  using ::arrow::TimestampBuilder;
+  using ::arrow::default_memory_pool;
+
+  auto timestamp_type = std::make_shared<TimestampType>(TimeUnit::NANO);
+
+  TimestampBuilder builder(timestamp_type, default_memory_pool());
+  for (std::int64_t ii = 0; ii < 10; ++ii) {
+    ASSERT_OK(builder.Append(1000000000L * ii));
+  }
+  std::shared_ptr<Array> values;
+  ASSERT_OK(builder.Finish(&values));
+
+  std::vector<std::shared_ptr<Field>> fields;
+  auto field = std::make_shared<Field>("nanos", timestamp_type);
+  fields.emplace_back(field);
+
+  auto schema = std::make_shared<Schema>(fields);
+
+  std::vector<std::shared_ptr<Column>> columns;
+  auto column = std::make_shared<Column>("nanos", values);
+  columns.emplace_back(column);
+
+  auto table = Table::Make(schema, columns);
+
+  auto arrow_writer_properties = ArrowWriterProperties::Builder()
+                                     .coerce_timestamps(TimeUnit::MICRO)
+                                     ->enable_deprecated_int96_timestamps()
+                                     ->build();
+
+  std::shared_ptr<Table> result;
+  DoSimpleRoundtrip(table, 1, table->num_rows(), {}, &result, arrow_writer_properties);
+
+  ASSERT_EQ(table->num_columns(), result->num_columns());
+  ASSERT_EQ(table->num_rows(), result->num_rows());
+
+  auto actual_column = result->column(0);
+  auto data = actual_column->data();
+  auto expected_values =
+      static_cast<::arrow::NumericArray<TimestampType>*>(values.get())->raw_values();
+  for (int ii = 0; ii < data->num_chunks(); ++ii) {
+    auto chunk =
+        static_cast<::arrow::NumericArray<TimestampType>*>(data->chunk(ii).get());
+    auto values = chunk->raw_values();
+    for (int64_t jj = 0; jj < chunk->length(); ++jj, ++expected_values) {
+      // Check that the nanos have been converted to micros
+      ASSERT_EQ(*expected_values / 1000, values[jj]);
+    }
+  }
+}
+
 void MakeDoubleTable(int num_columns, int num_rows, int nchunks,
                      std::shared_ptr<Table>* out) {
   std::shared_ptr<::arrow::Column> column;
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index 14e9c6e..50b4649 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -602,7 +602,14 @@ Status ArrowColumnWriter::WriteTimestamps(const Array& values, int64_t num_level
 
   const bool is_nanosecond = type.unit() == TimeUnit::NANO;
 
-  if (is_nanosecond && ctx_->properties->support_deprecated_int96_timestamps()) {
+  // In the case where support_deprecated_int96_timestamps was specified
+  // and coerce_timestamps_enabled was specified, a nanosecond column
+  // will have a physical type of int64. In that case, we fall through
+  // to the else if below.
+  //
+  // See https://issues.apache.org/jira/browse/ARROW-2082
+  if (is_nanosecond && ctx_->properties->support_deprecated_int96_timestamps() &&
+      !ctx_->properties->coerce_timestamps_enabled()) {
     return TypedWriteBatch<Int96Type, ::arrow::TimestampType>(values, num_levels,
                                                               def_levels, rep_levels);
   } else if (is_nanosecond ||

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.