You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2017/07/11 20:20:55 UTC
parquet-cpp git commit: PARQUET-1054: Fixes for Arrow API changes in
ARROW-1199
Repository: parquet-cpp
Updated Branches:
refs/heads/master 3dedeed1d -> 18b504482
PARQUET-1054: Fixes for Arrow API changes in ARROW-1199
This will be broken until ARROW-1199 is merged
Author: Wes McKinney <we...@twosigma.com>
Closes #370 from wesm/arrow-api-changes and squashes the following commits:
6d10f87 [Wes McKinney] Fix decode_benchmark
18cbac7 [Wes McKinney] Update Arrow version
0f11953 [Wes McKinney] Restore prior parameters in parquet 1.0 compatibility test
1c35d49 [Wes McKinney] Fix toolchain lib paths
3914214 [Wes McKinney] Update Arrow EP to exclude lz4/zstd
d033f25 [Wes McKinney] Activate conda toolchain environment
c1477a4 [Wes McKinney] Update Arrow version to include ARROW-1199
1d004fa [Wes McKinney] Fixes for Arrow API changes in ARROW-1199
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/18b50448
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/18b50448
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/18b50448
Branch: refs/heads/master
Commit: 18b504482be09ee4da37f5f825d68c4b8b00fe70
Parents: 3dedeed
Author: Wes McKinney <we...@twosigma.com>
Authored: Tue Jul 11 16:20:41 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Jul 11 16:20:41 2017 -0400
----------------------------------------------------------------------
benchmarks/decode_benchmark.cc | 1 +
ci/travis_script_static.sh | 17 +++++---
cmake_modules/ThirdpartyToolchain.cmake | 4 +-
src/parquet/arrow/arrow-reader-writer-test.cc | 13 ++++---
src/parquet/arrow/reader.cc | 4 +-
src/parquet/arrow/test-util.h | 5 ++-
src/parquet/arrow/writer.cc | 45 ++++++++--------------
src/parquet/file/writer.cc | 1 -
8 files changed, 43 insertions(+), 47 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/benchmarks/decode_benchmark.cc
----------------------------------------------------------------------
diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc
index 5514d8b..57279d0 100644
--- a/benchmarks/decode_benchmark.cc
+++ b/benchmarks/decode_benchmark.cc
@@ -20,6 +20,7 @@
#include <stdio.h>
#include "arrow/util/compression.h"
+#include "arrow/util/compression_snappy.h"
#include "parquet/encoding-internal.h"
#include "parquet/util/logging.h"
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/ci/travis_script_static.sh
----------------------------------------------------------------------
diff --git a/ci/travis_script_static.sh b/ci/travis_script_static.sh
index de57acc..4af2653 100755
--- a/ci/travis_script_static.sh
+++ b/ci/travis_script_static.sh
@@ -43,6 +43,8 @@ conda create -y -q -p $CPP_TOOLCHAIN \
boost-cpp thrift-cpp cmake git \
-c conda-forge
+source activate $CPP_TOOLCHAIN
+
# ----------------------------------------------------------------------
: ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/parquet-build}
@@ -51,12 +53,15 @@ export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN
export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH
export BOOST_ROOT=$CPP_TOOLCHAIN
export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/data
-export SNAPPY_STATIC_LIB=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/snappy_ep/src/snappy_ep-install/lib/libsnappy.a
-export BROTLI_STATIC_LIB_ENC=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlienc.a
-export BROTLI_STATIC_LIB_DEC=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlidec.a
-export BROTLI_STATIC_LIB_COMMON=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlicommon.a
-export ZLIB_STATIC_LIB=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/zlib_ep/src/zlib_ep-install/lib/libz.a
+ARROW_EP=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build
+BROTLI_EP=$ARROW_EP/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu
+
+export SNAPPY_STATIC_LIB=$ARROW_EP/snappy_ep/src/snappy_ep-install/lib/libsnappy.a
+export BROTLI_STATIC_LIB_ENC=$BROTLI_EP/libbrotlienc.a
+export BROTLI_STATIC_LIB_DEC=$BROTLI_EP/libbrotlidec.a
+export BROTLI_STATIC_LIB_COMMON=$BROTLI_EP/libbrotlicommon.a
+export ZLIB_STATIC_LIB=$ARROW_EP/zlib_ep/src/zlib_ep-install/lib/libz.a
cmake -DPARQUET_CXXFLAGS=-Werror \
-DPARQUET_TEST_MEMCHECK=ON \
@@ -70,7 +75,7 @@ cmake -DPARQUET_CXXFLAGS=-Werror \
pushd $CPP_BUILD_DIR
-make -j4 || exit 1
+make -j4 VERBOSE=1 || exit 1
ctest -VV -L unittest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; }
popd
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/cmake_modules/ThirdpartyToolchain.cmake
----------------------------------------------------------------------
diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake
index 85630a4..2717fb5 100644
--- a/cmake_modules/ThirdpartyToolchain.cmake
+++ b/cmake_modules/ThirdpartyToolchain.cmake
@@ -332,12 +332,14 @@ if (NOT ARROW_FOUND)
-DCMAKE_INSTALL_LIBDIR=${ARROW_LIB_DIR}
-DARROW_JEMALLOC=OFF
-DARROW_IPC=OFF
+ -DARROW_WITH_LZ4=OFF
+ -DARROW_WITH_ZSTD=OFF
-DARROW_BUILD_SHARED=${PARQUET_BUILD_SHARED}
-DARROW_BOOST_USE_SHARED=${PARQUET_BOOST_USE_SHARED}
-DARROW_BUILD_TESTS=OFF)
if ("$ENV{PARQUET_ARROW_VERSION}" STREQUAL "")
- set(ARROW_VERSION "a58893882ac8acd1ac4a5036685cbf09a9a09673")
+ set(ARROW_VERSION "afb192824a75ab81fbc8dcd2da56409186bb23e0")
else()
set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}")
endif()
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index ddc197c..746ce14 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -421,7 +421,9 @@ class TestParquetIO : public ::testing::Test {
std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data();
ASSERT_EQ(1, chunked_array->num_chunks());
- ASSERT_TRUE(values->Equals(chunked_array->chunk(0)));
+ auto result = chunked_array->chunk(0);
+
+ ASSERT_TRUE(values->Equals(result));
}
void CheckRoundTrip(const std::shared_ptr<Table>& table) {
@@ -762,7 +764,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) {
ASSERT_OK(int64_data->Resize(sizeof(int64_t) * values->length()));
int64_t* int64_data_ptr = reinterpret_cast<int64_t*>(int64_data->mutable_data());
const uint32_t* uint32_data_ptr =
- reinterpret_cast<const uint32_t*>(values->data()->data());
+ reinterpret_cast<const uint32_t*>(values->values()->data());
// std::copy might be faster but this is explicit on the casts)
for (int64_t i = 0; i < values->length(); i++) {
int64_data_ptr[i] = static_cast<int64_t>(uint32_data_ptr[i]);
@@ -1219,7 +1221,7 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> {
// Produce values for the columns
MakeValues(NUM_SIMPLE_TEST_ROWS);
- int32_t* values = reinterpret_cast<int32_t*>(values_array_->data()->mutable_data());
+ int32_t* values = reinterpret_cast<int32_t*>(values_array_->values()->mutable_data());
// Create the actual parquet file
InitNewParquetFile(
@@ -1283,7 +1285,7 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> {
// Produce values for the columns
MakeValues(num_rows);
- int32_t* values = reinterpret_cast<int32_t*>(values_array_->data()->mutable_data());
+ int32_t* values = reinterpret_cast<int32_t*>(values_array_->values()->mutable_data());
// Create the actual parquet file
InitNewParquetFile(std::static_pointer_cast<GroupNode>(schema_node), num_rows);
@@ -1323,7 +1325,8 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> {
}
virtual Status Visit(const ::arrow::StructArray& array) {
- for (auto& child : array.fields()) {
+ for (int32_t i = 0; i < array.num_fields(); ++i) {
+ auto child = array.field(i);
if (node_repetition_ == Repetition::REQUIRED) {
RETURN_NOT_OK(child->Accept(this));
} else if (node_repetition_ == Repetition::OPTIONAL) {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 9d7ceed..e941c1f 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -974,9 +974,9 @@ Status PrimitiveImpl::WrapIntoListArray(const int16_t* def_levels,
list_lengths.push_back(offset_builders[j]->length() - 1);
std::shared_ptr<Array> array;
RETURN_NOT_OK(offset_builders[j]->Finish(&array));
- offsets.emplace_back(std::static_pointer_cast<Int32Array>(array)->data());
+ offsets.emplace_back(std::static_pointer_cast<Int32Array>(array)->values());
RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array));
- valid_bits.emplace_back(std::static_pointer_cast<BooleanArray>(array)->data());
+ valid_bits.emplace_back(std::static_pointer_cast<BooleanArray>(array)->values());
}
std::shared_ptr<Array> output(*array);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index 191a399..946afad 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -332,7 +332,7 @@ template <typename T>
void ExpectArray(T* expected, Array* result) {
auto p_array = static_cast<::arrow::PrimitiveArray*>(result);
for (int i = 0; i < result->length(); i++) {
- EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->data()->data())[i]);
+ EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]);
}
}
@@ -341,7 +341,8 @@ void ExpectArrayT(void* expected, Array* result) {
::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result);
for (int64_t i = 0; i < result->length(); i++) {
EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i],
- reinterpret_cast<const typename ArrowType::c_type*>(p_array->data()->data())[i]);
+ reinterpret_cast<const typename ArrowType::c_type*>(
+ p_array->values()->data())[i]);
}
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index a7aebac..104c040 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -61,30 +61,14 @@ class LevelBuilder {
Status VisitInline(const Array& array);
- Status Visit(const ::arrow::NullArray& array) {
- array_offsets_.push_back(static_cast<int32_t>(array.offset()));
- valid_bitmaps_.push_back(array.null_bitmap_data());
- null_counts_.push_back(array.length());
- values_type_ = array.type_id();
- values_array_ = &array;
- return Status::OK();
- }
-
- Status Visit(const ::arrow::PrimitiveArray& array) {
- array_offsets_.push_back(static_cast<int32_t>(array.offset()));
- valid_bitmaps_.push_back(array.null_bitmap_data());
- null_counts_.push_back(array.null_count());
- values_type_ = array.type_id();
- values_array_ = &array;
- return Status::OK();
- }
-
- Status Visit(const ::arrow::BinaryArray& array) {
+ template <typename T>
+ typename std::enable_if<std::is_base_of<::arrow::FlatArray, T>::value, Status>::type
+ Visit(const T& array) {
array_offsets_.push_back(static_cast<int32_t>(array.offset()));
valid_bitmaps_.push_back(array.null_bitmap_data());
null_counts_.push_back(array.null_count());
values_type_ = array.type_id();
- values_array_ = &array;
+ values_array_ = std::make_shared<T>(array.data());
return Status::OK();
}
@@ -115,7 +99,7 @@ class LevelBuilder {
Status GenerateLevels(const Array& array, const std::shared_ptr<Field>& field,
int64_t* values_offset, ::arrow::Type::type* values_type, int64_t* num_values,
int64_t* num_levels, std::shared_ptr<Buffer>* def_levels,
- std::shared_ptr<Buffer>* rep_levels, const Array** values_array) {
+ std::shared_ptr<Buffer>* rep_levels, std::shared_ptr<Array>* values_array) {
// Work downwards to extract bitmaps and offsets
min_offset_idx_ = 0;
max_offset_idx_ = static_cast<int32_t>(array.length());
@@ -173,11 +157,11 @@ class LevelBuilder {
std::shared_ptr<Array> def_levels_array;
RETURN_NOT_OK(def_levels_.Finish(&def_levels_array));
- *def_levels = static_cast<PrimitiveArray*>(def_levels_array.get())->data();
+ *def_levels = static_cast<PrimitiveArray*>(def_levels_array.get())->values();
std::shared_ptr<Array> rep_levels_array;
RETURN_NOT_OK(rep_levels_.Finish(&rep_levels_array));
- *rep_levels = static_cast<PrimitiveArray*>(rep_levels_array.get())->data();
+ *rep_levels = static_cast<PrimitiveArray*>(rep_levels_array.get())->values();
*num_levels = rep_levels_array->length();
}
@@ -248,7 +232,7 @@ class LevelBuilder {
int32_t min_offset_idx_;
int32_t max_offset_idx_;
::arrow::Type::type values_type_;
- const Array* values_array_;
+ std::shared_ptr<Array> values_array_;
};
Status LevelBuilder::VisitInline(const Array& array) {
@@ -311,7 +295,7 @@ Status FileWriter::Impl::TypedWriteBatch(ColumnWriter* column_writer,
using ArrowCType = typename ArrowType::c_type;
auto data = static_cast<const PrimitiveArray*>(array.get());
- auto data_ptr = reinterpret_cast<const ArrowCType*>(data->data()->data());
+ auto data_ptr = reinterpret_cast<const ArrowCType*>(data->values()->data());
auto writer = reinterpret_cast<TypedColumnWriter<ParquetType>*>(column_writer);
if (writer->descr()->schema_node()->is_required() || (data->null_count() == 0)) {
@@ -501,7 +485,7 @@ Status FileWriter::Impl::TypedWriteBatch<BooleanType, ::arrow::BooleanType>(
const int16_t* def_levels, const int16_t* rep_levels) {
RETURN_NOT_OK(data_buffer_.Resize(array->length()));
auto data = static_cast<const BooleanArray*>(array.get());
- auto data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data());
+ auto data_ptr = reinterpret_cast<const uint8_t*>(data->values()->data());
auto buffer_ptr = reinterpret_cast<bool*>(data_buffer_.mutable_data());
auto writer = reinterpret_cast<TypedColumnWriter<BooleanType>*>(column_writer);
@@ -540,8 +524,8 @@ Status FileWriter::Impl::TypedWriteBatch<ByteArrayType, ::arrow::BinaryType>(
// data->data() points already to a nullptr, thus data->data()->data() will
// segfault.
const uint8_t* data_ptr = nullptr;
- if (data->data()) {
- data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data());
+ if (data->value_data()) {
+ data_ptr = reinterpret_cast<const uint8_t*>(data->value_data()->data());
DCHECK(data_ptr != nullptr);
}
auto writer = reinterpret_cast<TypedColumnWriter<ByteArrayType>*>(column_writer);
@@ -620,14 +604,15 @@ Status FileWriter::Impl::WriteColumnChunk(const Array& data) {
std::shared_ptr<::arrow::Schema> arrow_schema;
RETURN_NOT_OK(FromParquetSchema(writer_->schema(), {current_column_idx - 1},
writer_->key_value_metadata(), &arrow_schema));
- LevelBuilder level_builder(pool_);
std::shared_ptr<Buffer> def_levels_buffer;
std::shared_ptr<Buffer> rep_levels_buffer;
int64_t values_offset;
::arrow::Type::type values_type;
int64_t num_levels;
int64_t num_values;
- const Array* _values_array;
+
+ std::shared_ptr<Array> _values_array;
+ LevelBuilder level_builder(pool_);
RETURN_NOT_OK(level_builder.GenerateLevels(data, arrow_schema->field(0), &values_offset,
&values_type, &num_values, &num_levels, &def_levels_buffer, &rep_levels_buffer,
&_values_array));
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/file/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/writer.cc b/src/parquet/file/writer.cc
index cafa6b8..d52c25c 100644
--- a/src/parquet/file/writer.cc
+++ b/src/parquet/file/writer.cc
@@ -53,7 +53,6 @@ int64_t RowGroupWriter::num_rows() const {
return contents_->num_rows();
}
-
// ----------------------------------------------------------------------
// ParquetFileWriter public API