You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/10/11 15:18:08 UTC
parquet-cpp git commit: PARQUET-747: Better hide
TypedRowGroupStatistics in public API
Repository: parquet-cpp
Updated Branches:
refs/heads/master a26ed0f17 -> 78ed6e8ce
PARQUET-747: Better hide TypedRowGroupStatistics in public API
EDIT: It appears the linker issue is coming from the use of `TypedRowGroupStatistics<T>::Update` in `WriteBatch`, and Update hasn't been exported. There's some gcc quirks around this (see https://codereview.chromium.org/643703003/), so it's easier to hide the WriteBatch implementation so that `libparquet_arrow` doesn't look for non-exported symbols during linking (since `WriteBatch` was being inlined).
Template subclasses don't behave in the same way with visibility attributes. See also PARQUET-659. Observed in build failure https://circleci.com/gh/conda-forge/staged-recipes/10100
Author: Wes McKinney <we...@apache.org>
Closes #178 from wesm/PARQUET-747 and squashes the following commits:
8e91f86 [Wes McKinney] Do not expose WriteBatch implementation in public headers
206f4e1 [Wes McKinney] clang format
ed5132d [Wes McKinney] Export single instantiations of TypedRowGroupStatistics with extern templates
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/78ed6e8c
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/78ed6e8c
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/78ed6e8c
Branch: refs/heads/master
Commit: 78ed6e8cebfabd28567a6feaf82d736bce5316d2
Parents: a26ed0f
Author: Wes McKinney <we...@apache.org>
Authored: Tue Oct 11 11:18:01 2016 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Tue Oct 11 11:18:01 2016 -0400
----------------------------------------------------------------------
src/parquet/arrow/arrow-reader-writer-test.cc | 7 +-
src/parquet/column/column-writer-test.cc | 3 +-
src/parquet/column/scanner-test.cc | 3 +-
src/parquet/column/statistics.cc | 26 +------
src/parquet/column/statistics.h | 38 +++++++++-
src/parquet/column/writer.cc | 80 +++++++++++++++++++++
src/parquet/column/writer.h | 82 +---------------------
src/parquet/encodings/encoding-test.cc | 3 +-
src/parquet/file/file-serialize-test.cc | 3 +-
src/parquet/thrift/util.h | 6 +-
src/parquet/util/comparison.h | 2 +-
src/parquet/util/test-common.h | 3 +-
12 files changed, 137 insertions(+), 119 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index 11ebee0..b1f1c52 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -258,7 +258,8 @@ class TestParquetIO : public ::testing::Test {
typedef ::testing::Types<::arrow::BooleanType, ::arrow::UInt8Type, ::arrow::Int8Type,
::arrow::UInt16Type, ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type,
::arrow::Int64Type, ::arrow::TimestampType, ::arrow::FloatType, ::arrow::DoubleType,
- ::arrow::StringType> TestTypes;
+ ::arrow::StringType>
+ TestTypes;
TYPED_TEST_CASE(TestParquetIO, TestTypes);
@@ -476,8 +477,8 @@ class TestPrimitiveParquetIO : public TestParquetIO<TestType> {
typedef ::testing::Types<::arrow::BooleanType, ::arrow::UInt8Type, ::arrow::Int8Type,
::arrow::UInt16Type, ::arrow::Int16Type, ::arrow::UInt32Type, ::arrow::Int32Type,
- ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::FloatType,
- ::arrow::DoubleType> PrimitiveTestTypes;
+ ::arrow::UInt64Type, ::arrow::Int64Type, ::arrow::FloatType, ::arrow::DoubleType>
+ PrimitiveTestTypes;
TYPED_TEST_CASE(TestPrimitiveParquetIO, PrimitiveTestTypes);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 9f04c06..745efe7 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -214,7 +214,8 @@ void TestPrimitiveWriter<FLBAType>::ReadColumnFully(Compression::type compressio
}
typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
- BooleanType, ByteArrayType, FLBAType> TestTypes;
+ BooleanType, ByteArrayType, FLBAType>
+ TestTypes;
TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/scanner-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc
index 3ac07dc..8eee191 100644
--- a/src/parquet/column/scanner-test.cc
+++ b/src/parquet/column/scanner-test.cc
@@ -146,7 +146,8 @@ static int num_pages = 20;
static int batch_size = 32;
typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
- ByteArrayType> TestTypes;
+ ByteArrayType>
+ TestTypes;
using TestBooleanFlatScanner = TestFlatScanner<BooleanType>;
using TestFLBAFlatScanner = TestFlatScanner<FLBAType>;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/statistics.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/statistics.cc b/src/parquet/column/statistics.cc
index 65d7df5..d8f8785 100644
--- a/src/parquet/column/statistics.cc
+++ b/src/parquet/column/statistics.cc
@@ -20,10 +20,10 @@
#include "parquet/column/statistics.h"
#include "parquet/encodings/plain-encoding.h"
+#include "parquet/exception.h"
#include "parquet/util/buffer.h"
#include "parquet/util/comparison.h"
#include "parquet/util/output.h"
-#include "parquet/exception.h"
namespace parquet {
@@ -79,30 +79,6 @@ void TypedRowGroupStatistics<DType>::Reset() {
}
template <typename DType>
-void TypedRowGroupStatistics<DType>::Copy(const T& src, T* dst, OwnedMutableBuffer&) {
- *dst = src;
-}
-
-template <>
-void TypedRowGroupStatistics<FLBAType>::Copy(
- const FLBA& src, FLBA* dst, OwnedMutableBuffer& buffer) {
- if (dst->ptr == src.ptr) return;
- uint32_t len = descr_->type_length();
- buffer.Resize(len);
- std::memcpy(&buffer[0], src.ptr, len);
- *dst = FLBA(buffer.data());
-}
-
-template <>
-void TypedRowGroupStatistics<ByteArrayType>::Copy(
- const ByteArray& src, ByteArray* dst, OwnedMutableBuffer& buffer) {
- if (dst->ptr == src.ptr) return;
- buffer.Resize(src.len);
- std::memcpy(&buffer[0], src.ptr, src.len);
- *dst = ByteArray(src.len, buffer.data());
-}
-
-template <typename DType>
void TypedRowGroupStatistics<DType>::Update(
const T* values, int64_t num_not_null, int64_t num_null) {
DCHECK(num_not_null >= 0);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/statistics.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/statistics.h b/src/parquet/column/statistics.h
index 02e5abd..dc576c8 100644
--- a/src/parquet/column/statistics.h
+++ b/src/parquet/column/statistics.h
@@ -22,8 +22,8 @@
#include <memory>
#include <string>
-#include "parquet/types.h"
#include "parquet/schema/descriptor.h"
+#include "parquet/types.h"
#include "parquet/util/buffer.h"
#include "parquet/util/mem-allocator.h"
#include "parquet/util/visibility.h"
@@ -130,7 +130,7 @@ class PARQUET_EXPORT RowGroupStatistics
};
template <typename DType>
-class PARQUET_EXPORT TypedRowGroupStatistics : public RowGroupStatistics {
+class TypedRowGroupStatistics : public RowGroupStatistics {
public:
using T = typename DType::c_type;
@@ -170,6 +170,31 @@ class PARQUET_EXPORT TypedRowGroupStatistics : public RowGroupStatistics {
OwnedMutableBuffer min_buffer_, max_buffer_;
};
+template <typename DType>
+inline void TypedRowGroupStatistics<DType>::Copy(
+ const T& src, T* dst, OwnedMutableBuffer&) {
+ *dst = src;
+}
+
+template <>
+inline void TypedRowGroupStatistics<FLBAType>::Copy(
+ const FLBA& src, FLBA* dst, OwnedMutableBuffer& buffer) {
+ if (dst->ptr == src.ptr) return;
+ uint32_t len = descr_->type_length();
+ buffer.Resize(len);
+ std::memcpy(&buffer[0], src.ptr, len);
+ *dst = FLBA(buffer.data());
+}
+
+template <>
+inline void TypedRowGroupStatistics<ByteArrayType>::Copy(
+ const ByteArray& src, ByteArray* dst, OwnedMutableBuffer& buffer) {
+ if (dst->ptr == src.ptr) return;
+ buffer.Resize(src.len);
+ std::memcpy(&buffer[0], src.ptr, src.len);
+ *dst = ByteArray(src.len, buffer.data());
+}
+
using BoolStatistics = TypedRowGroupStatistics<BooleanType>;
using Int32Statistics = TypedRowGroupStatistics<Int32Type>;
using Int64Statistics = TypedRowGroupStatistics<Int64Type>;
@@ -179,6 +204,15 @@ using DoubleStatistics = TypedRowGroupStatistics<DoubleType>;
using ByteArrayStatistics = TypedRowGroupStatistics<ByteArrayType>;
using FLBAStatistics = TypedRowGroupStatistics<FLBAType>;
+extern template class TypedRowGroupStatistics<BooleanType>;
+extern template class TypedRowGroupStatistics<Int32Type>;
+extern template class TypedRowGroupStatistics<Int64Type>;
+extern template class TypedRowGroupStatistics<Int96Type>;
+extern template class TypedRowGroupStatistics<FloatType>;
+extern template class TypedRowGroupStatistics<DoubleType>;
+extern template class TypedRowGroupStatistics<ByteArrayType>;
+extern template class TypedRowGroupStatistics<FLBAType>;
+
} // namespace parquet
#endif // PARQUET_COLUMN_STATISTICS_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.cc b/src/parquet/column/writer.cc
index 5c10ab9..b917945 100644
--- a/src/parquet/column/writer.cc
+++ b/src/parquet/column/writer.cc
@@ -301,6 +301,86 @@ std::shared_ptr<ColumnWriter> ColumnWriter::Make(ColumnChunkMetaDataBuilder* met
// ----------------------------------------------------------------------
// Instantiate templated classes
+template <typename DType>
+inline int64_t TypedColumnWriter<DType>::WriteMiniBatch(int64_t num_values,
+ const int16_t* def_levels, const int16_t* rep_levels, const T* values) {
+ int64_t values_to_write = 0;
+ // If the field is required and non-repeated, there are no definition levels
+ if (descr_->max_definition_level() > 0) {
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (def_levels[i] == descr_->max_definition_level()) { ++values_to_write; }
+ }
+
+ WriteDefinitionLevels(num_values, def_levels);
+ } else {
+ // Required field, write all values
+ values_to_write = num_values;
+ }
+
+ // Not present for non-repeated fields
+ if (descr_->max_repetition_level() > 0) {
+ // A row could include more than one value
+ // Count the occasions where we start a new row
+ for (int64_t i = 0; i < num_values; ++i) {
+ if (rep_levels[i] == 0) { num_rows_++; }
+ }
+
+ WriteRepetitionLevels(num_values, rep_levels);
+ } else {
+ // Each value is exactly one row
+ num_rows_ += num_values;
+ }
+
+ if (num_rows_ > expected_rows_) {
+ throw ParquetException("More rows were written in the column chunk than expected");
+ }
+
+ WriteValues(values_to_write, values);
+
+ if (page_statistics_ != nullptr) {
+ page_statistics_->Update(values, values_to_write, num_values - values_to_write);
+ }
+
+ num_buffered_values_ += num_values;
+ num_buffered_encoded_values_ += values_to_write;
+
+ if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
+ AddDataPage();
+ }
+ if (has_dictionary_ && !fallback_) { CheckDictionarySizeLimit(); }
+
+ return values_to_write;
+}
+
+template <typename DType>
+void TypedColumnWriter<DType>::WriteBatch(int64_t num_values,
+ const int16_t* def_levels, const int16_t* rep_levels, const T* values) {
+ // We check for DataPage limits only after we have inserted the values. If a user
+ // writes a large number of values, the DataPage size can be much above the limit.
+ // The purpose of this chunking is to bound this. Even if a user writes large number
+ // of values, the chunking will ensure the AddDataPage() is called at a reasonable
+ // pagesize limit
+ int64_t write_batch_size = properties_->write_batch_size();
+ int num_batches = num_values / write_batch_size;
+ int64_t num_remaining = num_values % write_batch_size;
+ int64_t value_offset = 0;
+ for (int round = 0; round < num_batches; round++) {
+ int64_t offset = round * write_batch_size;
+ int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset],
+ &rep_levels[offset], &values[value_offset]);
+ value_offset += num_values;
+ }
+ // Write the remaining values
+ int64_t offset = num_batches * write_batch_size;
+ WriteMiniBatch(
+ num_remaining, &def_levels[offset], &rep_levels[offset], &values[value_offset]);
+}
+
+template <typename DType>
+void TypedColumnWriter<DType>::WriteValues(int64_t num_values, const T* values) {
+ current_encoder_->Put(values, num_values);
+}
+
template class TypedColumnWriter<BooleanType>;
template class TypedColumnWriter<Int32Type>;
template class TypedColumnWriter<Int64Type>;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/column/writer.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/writer.h b/src/parquet/column/writer.h
index ab01818..67a29bc 100644
--- a/src/parquet/column/writer.h
+++ b/src/parquet/column/writer.h
@@ -24,8 +24,8 @@
#include "parquet/column/page.h"
#include "parquet/column/properties.h"
#include "parquet/column/statistics.h"
-#include "parquet/file/metadata.h"
#include "parquet/encodings/encoder.h"
+#include "parquet/file/metadata.h"
#include "parquet/schema/descriptor.h"
#include "parquet/types.h"
#include "parquet/util/mem-allocator.h"
@@ -186,86 +186,6 @@ class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter {
std::unique_ptr<TypedStats> chunk_statistics_;
};
-template <typename DType>
-inline int64_t TypedColumnWriter<DType>::WriteMiniBatch(int64_t num_values,
- const int16_t* def_levels, const int16_t* rep_levels, const T* values) {
- int64_t values_to_write = 0;
- // If the field is required and non-repeated, there are no definition levels
- if (descr_->max_definition_level() > 0) {
- for (int64_t i = 0; i < num_values; ++i) {
- if (def_levels[i] == descr_->max_definition_level()) { ++values_to_write; }
- }
-
- WriteDefinitionLevels(num_values, def_levels);
- } else {
- // Required field, write all values
- values_to_write = num_values;
- }
-
- // Not present for non-repeated fields
- if (descr_->max_repetition_level() > 0) {
- // A row could include more than one value
- // Count the occasions where we start a new row
- for (int64_t i = 0; i < num_values; ++i) {
- if (rep_levels[i] == 0) { num_rows_++; }
- }
-
- WriteRepetitionLevels(num_values, rep_levels);
- } else {
- // Each value is exactly one row
- num_rows_ += num_values;
- }
-
- if (num_rows_ > expected_rows_) {
- throw ParquetException("More rows were written in the column chunk than expected");
- }
-
- WriteValues(values_to_write, values);
-
- if (page_statistics_ != nullptr) {
- page_statistics_->Update(values, values_to_write, num_values - values_to_write);
- }
-
- num_buffered_values_ += num_values;
- num_buffered_encoded_values_ += values_to_write;
-
- if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) {
- AddDataPage();
- }
- if (has_dictionary_ && !fallback_) { CheckDictionarySizeLimit(); }
-
- return values_to_write;
-}
-
-template <typename DType>
-inline void TypedColumnWriter<DType>::WriteBatch(int64_t num_values,
- const int16_t* def_levels, const int16_t* rep_levels, const T* values) {
- // We check for DataPage limits only after we have inserted the values. If a user
- // writes a large number of values, the DataPage size can be much above the limit.
- // The purpose of this chunking is to bound this. Even if a user writes large number
- // of values, the chunking will ensure the AddDataPage() is called at a reasonable
- // pagesize limit
- int64_t write_batch_size = properties_->write_batch_size();
- int num_batches = num_values / write_batch_size;
- int64_t num_remaining = num_values % write_batch_size;
- int64_t value_offset = 0;
- for (int round = 0; round < num_batches; round++) {
- int64_t offset = round * write_batch_size;
- int64_t num_values = WriteMiniBatch(write_batch_size, &def_levels[offset],
- &rep_levels[offset], &values[value_offset]);
- value_offset += num_values;
- }
- // Write the remaining values
- int64_t offset = num_batches * write_batch_size;
- WriteMiniBatch(
- num_remaining, &def_levels[offset], &rep_levels[offset], &values[value_offset]);
-}
-
-template <typename DType>
-void TypedColumnWriter<DType>::WriteValues(int64_t num_values, const T* values) {
- current_encoder_->Put(values, num_values);
-}
-
typedef TypedColumnWriter<BooleanType> BoolWriter;
typedef TypedColumnWriter<Int32Type> Int32Writer;
typedef TypedColumnWriter<Int64Type> Int64Writer;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/encodings/encoding-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encoding-test.cc b/src/parquet/encodings/encoding-test.cc
index 6af7c9d..daa25cb 100644
--- a/src/parquet/encodings/encoding-test.cc
+++ b/src/parquet/encodings/encoding-test.cc
@@ -238,7 +238,8 @@ TYPED_TEST(TestPlainEncoding, BasicRoundTrip) {
// Dictionary encoding tests
typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
- ByteArrayType, FLBAType> DictEncodedTypes;
+ ByteArrayType, FLBAType>
+ DictEncodedTypes;
template <typename Type>
class TestDictionaryEncoding : public TestEncodingBase<Type> {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/file/file-serialize-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/file/file-serialize-test.cc b/src/parquet/file/file-serialize-test.cc
index 5980e55..42a73c9 100644
--- a/src/parquet/file/file-serialize-test.cc
+++ b/src/parquet/file/file-serialize-test.cc
@@ -106,7 +106,8 @@ class TestSerialize : public PrimitiveTypedTest<TestType> {
};
typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
- BooleanType, ByteArrayType, FLBAType> TestTypes;
+ BooleanType, ByteArrayType, FLBAType>
+ TestTypes;
TYPED_TEST_CASE(TestSerialize, TestTypes);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/thrift/util.h
----------------------------------------------------------------------
diff --git a/src/parquet/thrift/util.h b/src/parquet/thrift/util.h
index f629689..a340c6c 100644
--- a/src/parquet/thrift/util.h
+++ b/src/parquet/thrift/util.h
@@ -82,7 +82,8 @@ inline void DeserializeThriftMsg(const uint8_t* buf, uint32_t* len, T* deseriali
boost::shared_ptr<apache::thrift::transport::TMemoryBuffer> tmem_transport(
new apache::thrift::transport::TMemoryBuffer(const_cast<uint8_t*>(buf), *len));
apache::thrift::protocol::TCompactProtocolFactoryT<
- apache::thrift::transport::TMemoryBuffer> tproto_factory;
+ apache::thrift::transport::TMemoryBuffer>
+ tproto_factory;
boost::shared_ptr<apache::thrift::protocol::TProtocol> tproto =
tproto_factory.getProtocol(tmem_transport);
try {
@@ -104,7 +105,8 @@ inline void SerializeThriftMsg(T* obj, uint32_t len, OutputStream* out) {
boost::shared_ptr<apache::thrift::transport::TMemoryBuffer> mem_buffer(
new apache::thrift::transport::TMemoryBuffer(len));
apache::thrift::protocol::TCompactProtocolFactoryT<
- apache::thrift::transport::TMemoryBuffer> tproto_factory;
+ apache::thrift::transport::TMemoryBuffer>
+ tproto_factory;
boost::shared_ptr<apache::thrift::protocol::TProtocol> tproto =
tproto_factory.getProtocol(mem_buffer);
try {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/util/comparison.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/comparison.h b/src/parquet/util/comparison.h
index 9d44e7e..5ca7520 100644
--- a/src/parquet/util/comparison.h
+++ b/src/parquet/util/comparison.h
@@ -20,8 +20,8 @@
#include <algorithm>
-#include "parquet/types.h"
#include "parquet/schema/descriptor.h"
+#include "parquet/types.h"
namespace parquet {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/78ed6e8c/src/parquet/util/test-common.h
----------------------------------------------------------------------
diff --git a/src/parquet/util/test-common.h b/src/parquet/util/test-common.h
index edadb53..2327aeb 100644
--- a/src/parquet/util/test-common.h
+++ b/src/parquet/util/test-common.h
@@ -32,7 +32,8 @@ namespace parquet {
namespace test {
typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type, FloatType,
- DoubleType, ByteArrayType, FLBAType> ParquetTypes;
+ DoubleType, ByteArrayType, FLBAType>
+ ParquetTypes;
template <typename T>
static inline void assert_vector_equal(const vector<T>& left, const vector<T>& right) {