Posted to commits@parquet.apache.org by we...@apache.org on 2017/07/11 20:20:55 UTC

parquet-cpp git commit: PARQUET-1054: Fixes for Arrow API changes in ARROW-1199

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 3dedeed1d -> 18b504482


PARQUET-1054: Fixes for Arrow API changes in ARROW-1199

This will be broken until ARROW-1199 is merged

Author: Wes McKinney <we...@twosigma.com>

Closes #370 from wesm/arrow-api-changes and squashes the following commits:

6d10f87 [Wes McKinney] Fix decode_benchmark
18cbac7 [Wes McKinney] Update Arrow version
0f11953 [Wes McKinney] Restore prior parameters in parquet 1.0 compatibility test
1c35d49 [Wes McKinney] Fix toolchain lib paths
3914214 [Wes McKinney] Update Arrow EP to exclude lz4/zstd
d033f25 [Wes McKinney] Activate conda toolchain environment
c1477a4 [Wes McKinney] Update Arrow version to include ARROW-1199
1d004fa [Wes McKinney] Fixes for Arrow API changes in ARROW-1199


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/18b50448
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/18b50448
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/18b50448

Branch: refs/heads/master
Commit: 18b504482be09ee4da37f5f825d68c4b8b00fe70
Parents: 3dedeed
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Jul 11 16:20:41 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jul 11 16:20:41 2017 -0400

----------------------------------------------------------------------
 benchmarks/decode_benchmark.cc                |  1 +
 ci/travis_script_static.sh                    | 17 +++++---
 cmake_modules/ThirdpartyToolchain.cmake       |  4 +-
 src/parquet/arrow/arrow-reader-writer-test.cc | 13 ++++---
 src/parquet/arrow/reader.cc                   |  4 +-
 src/parquet/arrow/test-util.h                 |  5 ++-
 src/parquet/arrow/writer.cc                   | 45 ++++++++--------------
 src/parquet/file/writer.cc                    |  1 -
 8 files changed, 43 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
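
For readers tracking the ARROW-1199 rename, the call-site changes in this patch boil down to a handful of Arrow accessor renames. The summary below is taken directly from the hunks that follow; only the call sites are shown, not the full Arrow signatures:

// Before ARROW-1199 (removed in this patch):
//   primitive_array->data()->data();     // raw value buffer of a PrimitiveArray
//   binary_array->data()->data();        // raw value buffer of a BinaryArray
//   struct_array.fields();               // child arrays of a StructArray
// After ARROW-1199 (added in this patch):
//   primitive_array->values()->data();   // PrimitiveArray/BooleanArray value buffer
//   binary_array->value_data()->data();  // BinaryArray value buffer
//   struct_array.num_fields();           // child count ...
//   struct_array.field(i);               // ... and per-index child access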


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/benchmarks/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc
index 5514d8b..57279d0 100644
--- a/benchmarks/decode_benchmark.cc
+++ b/benchmarks/decode_benchmark.cc
@@ -20,6 +20,7 @@
 #include <stdio.h>
 
 #include "arrow/util/compression.h"
+#include "arrow/util/compression_snappy.h"
 
 #include "parquet/encoding-internal.h"
 #include "parquet/util/logging.h"

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/ci/travis_script_static.sh
----------------------------------------------------------------------
diff --git a/ci/travis_script_static.sh b/ci/travis_script_static.sh
index de57acc..4af2653 100755
--- a/ci/travis_script_static.sh
+++ b/ci/travis_script_static.sh
@@ -43,6 +43,8 @@ conda create -y -q -p $CPP_TOOLCHAIN \
       boost-cpp thrift-cpp cmake git \
       -c conda-forge
 
+source activate $CPP_TOOLCHAIN
+
 # ----------------------------------------------------------------------
 
 : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/parquet-build}
@@ -51,12 +53,15 @@ export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
 export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH
 export BOOST_ROOT=$CPP_TOOLCHAIN
 export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/data
-export SNAPPY_STATIC_LIB=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/snappy_ep/src/snappy_ep-install/lib/libsnappy.a
-export BROTLI_STATIC_LIB_ENC=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlienc.a
-export BROTLI_STATIC_LIB_DEC=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlidec.a
-export BROTLI_STATIC_LIB_COMMON=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlicommon.a
-export ZLIB_STATIC_LIB=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/zlib_ep/src/zlib_ep-install/lib/libz.a
 
+ARROW_EP=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build
+BROTLI_EP=$ARROW_EP/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu
+
+export SNAPPY_STATIC_LIB=$ARROW_EP/snappy_ep/src/snappy_ep-install/lib/libsnappy.a
+export BROTLI_STATIC_LIB_ENC=$BROTLI_EP/libbrotlienc.a
+export BROTLI_STATIC_LIB_DEC=$BROTLI_EP/libbrotlidec.a
+export BROTLI_STATIC_LIB_COMMON=$BROTLI_EP/libbrotlicommon.a
+export ZLIB_STATIC_LIB=$ARROW_EP/zlib_ep/src/zlib_ep-install/lib/libz.a
 
 cmake -DPARQUET_CXXFLAGS=-Werror \
       -DPARQUET_TEST_MEMCHECK=ON \
@@ -70,7 +75,7 @@ cmake -DPARQUET_CXXFLAGS=-Werror \
 
 pushd $CPP_BUILD_DIR
 
-make -j4 || exit 1
+make -j4 VERBOSE=1 || exit 1
 ctest -VV -L unittest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; }
 
 popd

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/cmake_modules/ThirdpartyToolchain.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 85630a4..2717fb5 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -332,12 +332,14 @@ if (NOT ARROW_FOUND)
     -DCMAKE_INSTALL_LIBDIR=${ARROW_LIB_DIR}
     -DARROW_JEMALLOC=OFF
     -DARROW_IPC=OFF
+    -DARROW_WITH_LZ4=OFF
+    -DARROW_WITH_ZSTD=OFF
     -DARROW_BUILD_SHARED=${PARQUET_BUILD_SHARED}
     -DARROW_BOOST_USE_SHARED=${PARQUET_BOOST_USE_SHARED}
     -DARROW_BUILD_TESTS=OFF)
 
   if ("$ENV{PARQUET_ARROW_VERSION}" STREQUAL "")
-    set(ARROW_VERSION "a58893882ac8acd1ac4a5036685cbf09a9a09673")
+    set(ARROW_VERSION "afb192824a75ab81fbc8dcd2da56409186bb23e0")
   else()
     set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}")
   endif()

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index ddc197c..746ce14 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -421,7 +421,9 @@ class TestParquetIO : public ::testing::Test {
 
     std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
     ASSERT_EQ(1, chunked_array->num_chunks());
-    ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
+    auto result = chunked_array->chunk(0);
+
+    ASSERT_TRUE(values->Equals(result));
   }
 
   void CheckRoundTrip(const std::shared_ptr<Table>& table) {
@@ -762,7 +764,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) {
     ASSERT_OK(int64_data->Resize(sizeof(int64_t) * values->length()));
     int64_t* int64_data_ptr = reinterpret_cast<int64_t*>(int64_data->mutable_data());
     const uint32_t* uint32_data_ptr =
-        reinterpret_cast<const uint32_t*>(values->data()->data());
+        reinterpret_cast<const uint32_t*>(values->values()->data());
     // std::copy might be faster but this is explicit on the casts)
     for (int64_t i = 0; i < values->length(); i++) {
       int64_data_ptr[i] = static_cast<int64_t>(uint32_data_ptr[i]);
@@ -1219,7 +1221,7 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> {
 
     // Produce values for the columns
     MakeValues(NUM_SIMPLE_TEST_ROWS);
-    int32_t* values = reinterpret_cast<int32_t*>(values_array_->data()->mutable_data());
+    int32_t* values = reinterpret_cast<int32_t*>(values_array_->values()->mutable_data());
 
     // Create the actual parquet file
     InitNewParquetFile(
@@ -1283,7 +1285,7 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> {
 
     // Produce values for the columns
     MakeValues(num_rows);
-    int32_t* values = reinterpret_cast<int32_t*>(values_array_->data()->mutable_data());
+    int32_t* values = reinterpret_cast<int32_t*>(values_array_->values()->mutable_data());
 
     // Create the actual parquet file
     InitNewParquetFile(std::static_pointer_cast<GroupNode>(schema_node), num_rows);
@@ -1323,7 +1325,8 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> {
     }
 
     virtual Status Visit(const ::arrow::StructArray& array) {
-      for (auto& child : array.fields()) {
+      for (int32_t i = 0; i < array.num_fields(); ++i) {
+        auto child = array.field(i);
         if (node_repetition_ == Repetition::REQUIRED) {
           RETURN_NOT_OK(child->Accept(this));
         } else if (node_repetition_ == Repetition::OPTIONAL) {

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 9d7ceed..e941c1f 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -974,9 +974,9 @@ Status PrimitiveImpl::WrapIntoListArray(const int16_t* def_levels,
       list_lengths.push_back(offset_builders[j]->length() - 1);
       std::shared_ptr<Array> array;
       RETURN_NOT_OK(offset_builders[j]->Finish(&array));
-      offsets.emplace_back(std::static_pointer_cast<Int32Array>(array)->data());
+      offsets.emplace_back(std::static_pointer_cast<Int32Array>(array)->values());
       RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array));
-      valid_bits.emplace_back(std::static_pointer_cast<BooleanArray>(array)->data());
+      valid_bits.emplace_back(std::static_pointer_cast<BooleanArray>(array)->values());
     }
 
     std::shared_ptr<Array> output(*array);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 191a399..946afad 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -332,7 +332,7 @@ template <typename T>
 void ExpectArray(T* expected, Array* result) {
   auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
   for (int i = 0; i < result->length(); i++) {
-    EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->data()->data())[i]);
+    EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
   }
 }
 
@@ -341,7 +341,8 @@ void ExpectArrayT(void* expected, Array* result) {
   ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
   for (int64_t i = 0; i < result->length(); i++) {
     EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
-        reinterpret_cast<const typename ArrowType::c_type*>(p_array->data()->data())[i]);
+        reinterpret_cast<const typename ArrowType::c_type*>(
+            p_array->values()->data())[i]);
   }
 }
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index a7aebac..104c040 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -61,30 +61,14 @@ class LevelBuilder {
 
   Status VisitInline(const Array& array);
 
-  Status Visit(const ::arrow::NullArray& array) {
-    array_offsets_.push_back(static_cast<int32_t>(array.offset()));
-    valid_bitmaps_.push_back(array.null_bitmap_data());
-    null_counts_.push_back(array.length());
-    values_type_ = array.type_id();
-    values_array_ = &array;
-    return Status::OK();
-  }
-
-  Status Visit(const ::arrow::PrimitiveArray& array) {
-    array_offsets_.push_back(static_cast<int32_t>(array.offset()));
-    valid_bitmaps_.push_back(array.null_bitmap_data());
-    null_counts_.push_back(array.null_count());
-    values_type_ = array.type_id();
-    values_array_ = &array;
-    return Status::OK();
-  }
-
-  Status Visit(const ::arrow::BinaryArray& array) {
+  template <typename T>
+  typename std::enable_if<std::is_base_of<::arrow::FlatArray, T>::value, Status>::type
+  Visit(const T& array) {
     array_offsets_.push_back(static_cast<int32_t>(array.offset()));
     valid_bitmaps_.push_back(array.null_bitmap_data());
     null_counts_.push_back(array.null_count());
     values_type_ = array.type_id();
-    values_array_ = &array;
+    values_array_ = std::make_shared<T>(array.data());
     return Status::OK();
   }
 
@@ -115,7 +99,7 @@ class LevelBuilder {
   Status GenerateLevels(const Array& array, const std::shared_ptr<Field>& field,
       int64_t* values_offset, ::arrow::Type::type* values_type, int64_t* num_values,
       int64_t* num_levels, std::shared_ptr<Buffer>* def_levels,
-      std::shared_ptr<Buffer>* rep_levels, const Array** values_array) {
+      std::shared_ptr<Buffer>* rep_levels, std::shared_ptr<Array>* values_array) {
     // Work downwards to extract bitmaps and offsets
     min_offset_idx_ = 0;
     max_offset_idx_ = static_cast<int32_t>(array.length());
@@ -173,11 +157,11 @@ class LevelBuilder {
 
       std::shared_ptr<Array> def_levels_array;
       RETURN_NOT_OK(def_levels_.Finish(&def_levels_array));
-      *def_levels = static_cast<PrimitiveArray*>(def_levels_array.get())->data();
+      *def_levels = static_cast<PrimitiveArray*>(def_levels_array.get())->values();
 
       std::shared_ptr<Array> rep_levels_array;
       RETURN_NOT_OK(rep_levels_.Finish(&rep_levels_array));
-      *rep_levels = static_cast<PrimitiveArray*>(rep_levels_array.get())->data();
+      *rep_levels = static_cast<PrimitiveArray*>(rep_levels_array.get())->values();
       *num_levels = rep_levels_array->length();
     }
 
@@ -248,7 +232,7 @@ class LevelBuilder {
   int32_t min_offset_idx_;
   int32_t max_offset_idx_;
   ::arrow::Type::type values_type_;
-  const Array* values_array_;
+  std::shared_ptr<Array> values_array_;
 };
 
 Status LevelBuilder::VisitInline(const Array& array) {
@@ -311,7 +295,7 @@ Status FileWriter::Impl::TypedWriteBatch(ColumnWriter* column_writer,
   using ArrowCType = typename ArrowType::c_type;
 
   auto data = static_cast<const PrimitiveArray*>(array.get());
-  auto data_ptr = reinterpret_cast<const ArrowCType*>(data->data()->data());
+  auto data_ptr = reinterpret_cast<const ArrowCType*>(data->values()->data());
   auto writer = reinterpret_cast<TypedColumnWriter<ParquetType>*>(column_writer);
 
   if (writer->descr()->schema_node()->is_required() || (data->null_count() == 0)) {
@@ -501,7 +485,7 @@ Status FileWriter::Impl::TypedWriteBatch<BooleanType, ::arrow::BooleanType>(
     const int16_t* def_levels, const int16_t* rep_levels) {
   RETURN_NOT_OK(data_buffer_.Resize(array->length()));
   auto data = static_cast<const BooleanArray*>(array.get());
-  auto data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data());
+  auto data_ptr = reinterpret_cast<const uint8_t*>(data->values()->data());
   auto buffer_ptr = reinterpret_cast<bool*>(data_buffer_.mutable_data());
   auto writer = reinterpret_cast<TypedColumnWriter<BooleanType>*>(column_writer);
 
@@ -540,8 +524,8 @@ Status FileWriter::Impl::TypedWriteBatch<ByteArrayType, ::arrow::BinaryType>(
   // data->data() points already to a nullptr, thus data->data()->data() will
   // segfault.
   const uint8_t* data_ptr = nullptr;
-  if (data->data()) {
-    data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data());
+  if (data->value_data()) {
+    data_ptr = reinterpret_cast<const uint8_t*>(data->value_data()->data());
     DCHECK(data_ptr != nullptr);
   }
   auto writer = reinterpret_cast<TypedColumnWriter<ByteArrayType>*>(column_writer);
@@ -620,14 +604,15 @@ Status FileWriter::Impl::WriteColumnChunk(const Array& data) {
   std::shared_ptr<::arrow::Schema> arrow_schema;
   RETURN_NOT_OK(FromParquetSchema(writer_->schema(), {current_column_idx - 1},
       writer_->key_value_metadata(), &arrow_schema));
-  LevelBuilder level_builder(pool_);
   std::shared_ptr<Buffer> def_levels_buffer;
   std::shared_ptr<Buffer> rep_levels_buffer;
   int64_t values_offset;
   ::arrow::Type::type values_type;
   int64_t num_levels;
   int64_t num_values;
-  const Array* _values_array;
+
+  std::shared_ptr<Array> _values_array;
+  LevelBuilder level_builder(pool_);
   RETURN_NOT_OK(level_builder.GenerateLevels(data, arrow_schema->field(0), &values_offset,
       &values_type, &num_values, &num_levels, &def_levels_buffer, &rep_levels_buffer,
       &_values_array));
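
The LevelBuilder change above also collapses three nearly identical Visit overloads (Null/Primitive/Binary) into a single template constrained to FlatArray subclasses. A minimal standalone sketch of that dispatch pattern, using toy stand-in types rather than the real Arrow classes:

#include <iostream>
#include <type_traits>

// Toy stand-ins for the Arrow hierarchy (illustration only).
struct FlatArray { virtual ~FlatArray() = default; };
struct PrimitiveArray : FlatArray {};
struct BinaryArray : FlatArray {};
struct ListArray {};  // nested type, not a FlatArray

struct Visitor {
  // One templated Visit handles every FlatArray subclass the same way,
  // mirroring how LevelBuilder now collects offsets, bitmaps, and null
  // counts in one place instead of three copy-pasted overloads.
  template <typename T>
  typename std::enable_if<std::is_base_of<FlatArray, T>::value, void>::type
  Visit(const T&) { std::cout << "flat array: shared handling\n"; }

  // Nested arrays keep their own dedicated overload.
  void Visit(const ListArray&) { std::cout << "list array: nested handling\n"; }
};

int main() {
  Visitor v;
  v.Visit(PrimitiveArray{});  // dispatches to the templated overload
  v.Visit(BinaryArray{});     // same templated overload
  v.Visit(ListArray{});       // dedicated overload
  return 0;
}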

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/file/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/writer.cc b/src/parquet/file/writer.cc
index cafa6b8..d52c25c 100644
--- a/src/parquet/file/writer.cc
+++ b/src/parquet/file/writer.cc
@@ -53,7 +53,6 @@ int64_t RowGroupWriter::num_rows() const {
   return contents_->num_rows();
 }
 
-
 // ----------------------------------------------------------------------
 // ParquetFileWriter public API