You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by we...@apache.org on 2016/06/03 07:35:30 UTC
parquet-cpp git commit: PARQUET-598: Test writing all primitive data types
Repository: parquet-cpp
Updated Branches:
refs/heads/master 22b4977c1 -> bd887e2ca
PARQUET-598: Test writing all primitive data types
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #113 from xhochy/parquet-598 and squashes the following commits:
a22d7b7 [Uwe L. Korn] Address review comments
a82159b [Uwe L. Korn] Move specialization of InitValues to separate header
e2b48ea [Uwe L. Korn] Move specialization of InitValues to separate header
c3a6790 [Uwe L. Korn] PARQUET-598: Test writing all primitive data types
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/bd887e2c
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/bd887e2c
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/bd887e2c
Branch: refs/heads/master
Commit: bd887e2caacbe9779696c9e6515d36d1b16a792d
Parents: 22b4977
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Fri Jun 3 00:35:22 2016 -0700
Committer: Wes McKinney <we...@apache.org>
Committed: Fri Jun 3 00:35:22 2016 -0700
----------------------------------------------------------------------
src/parquet/column/column-writer-test.cc | 223 +++++++++++++++++---------
src/parquet/column/scanner-test.cc | 35 +---
src/parquet/column/test-specialization.h | 66 ++++++++
src/parquet/column/test-util.h | 6 +
4 files changed, 224 insertions(+), 106 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bd887e2c/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 653572a..5d89c1a 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -17,6 +17,9 @@
#include <gtest/gtest.h>
+#include "parquet/column/test-util.h"
+#include "parquet/column/test-specialization.h"
+
#include "parquet/file/reader-internal.h"
#include "parquet/file/writer-internal.h"
#include "parquet/column/reader.h"
@@ -32,59 +35,88 @@ using schema::PrimitiveNode;
namespace test {
+// The default size used in most tests.
+const int SMALL_SIZE = 100;
+// Larger size to test some corner cases, only used in some specific cases.
+const int LARGE_SIZE = 10000;
+
+template <typename TestType>
class TestPrimitiveWriter : public ::testing::Test {
public:
+ typedef typename TestType::c_type T;
+
void SetUpSchemaRequired() {
- node_ = PrimitiveNode::Make("int64", Repetition::REQUIRED, Type::INT64);
+ node_ = PrimitiveNode::Make("column", Repetition::REQUIRED, TestType::type_num,
+ LogicalType::NONE, FLBA_LENGTH);
schema_ = std::make_shared<ColumnDescriptor>(node_, 0, 0);
}
void SetUpSchemaOptional() {
- node_ = PrimitiveNode::Make("int64", Repetition::OPTIONAL, Type::INT64);
+ node_ = PrimitiveNode::Make("column", Repetition::OPTIONAL, TestType::type_num,
+ LogicalType::NONE, FLBA_LENGTH);
schema_ = std::make_shared<ColumnDescriptor>(node_, 1, 0);
}
void SetUpSchemaRepeated() {
- node_ = PrimitiveNode::Make("int64", Repetition::REPEATED, Type::INT64);
+ node_ = PrimitiveNode::Make("column", Repetition::REPEATED, TestType::type_num,
+ LogicalType::NONE, FLBA_LENGTH);
schema_ = std::make_shared<ColumnDescriptor>(node_, 1, 1);
}
+ void GenerateData(int64_t num_values);
+
+ void SetupValuesOut();
+
void SetUp() {
- values_out_.resize(100);
- definition_levels_out_.resize(100);
- repetition_levels_out_.resize(100);
+ SetupValuesOut();
+ definition_levels_out_.resize(SMALL_SIZE);
+ repetition_levels_out_.resize(SMALL_SIZE);
SetUpSchemaRequired();
}
- std::unique_ptr<Int64Reader> BuildReader() {
+ void BuildReader() {
auto buffer = sink_->GetBuffer();
std::unique_ptr<InMemoryInputStream> source(new InMemoryInputStream(buffer));
std::unique_ptr<SerializedPageReader> page_reader(
new SerializedPageReader(std::move(source), Compression::UNCOMPRESSED));
- return std::unique_ptr<Int64Reader>(
- new Int64Reader(schema_.get(), std::move(page_reader)));
+ reader_.reset(new TypedColumnReader<TestType>(schema_.get(), std::move(page_reader)));
}
- std::unique_ptr<Int64Writer> BuildWriter(int64_t output_size = 100) {
+ std::unique_ptr<TypedColumnWriter<TestType>> BuildWriter(
+ int64_t output_size = SMALL_SIZE) {
sink_.reset(new InMemoryOutputStream());
std::unique_ptr<SerializedPageWriter> pager(
new SerializedPageWriter(sink_.get(), Compression::UNCOMPRESSED, &metadata_));
- return std::unique_ptr<Int64Writer>(
- new Int64Writer(schema_.get(), std::move(pager), output_size));
+ return std::unique_ptr<TypedColumnWriter<TestType>>(
+ new TypedColumnWriter<TestType>(schema_.get(), std::move(pager), output_size));
}
+ void SyncValuesOut();
void ReadColumn() {
- auto reader = BuildReader();
- reader->ReadBatch(values_out_.size(), definition_levels_out_.data(),
- repetition_levels_out_.data(), values_out_.data(), &values_read_);
+ BuildReader();
+ reader_->ReadBatch(values_out_.size(), definition_levels_out_.data(),
+ repetition_levels_out_.data(), values_out_ptr_, &values_read_);
+ SyncValuesOut();
}
protected:
int64_t values_read_;
+ // Keep the reader alive: for ByteArray columns, the lifetime of the value
+ // contents is bound to the reader.
+ std::unique_ptr<TypedColumnReader<TestType>> reader_;
+
+ // Input buffers
+ std::vector<T> values_;
+ std::vector<uint8_t> buffer_;
+ // Pointer to the values, needed as we cannot use vector<bool>::data()
+ T* values_ptr_;
+ std::vector<uint8_t> bool_buffer_;
// Output buffers
- std::vector<int64_t> values_out_;
+ std::vector<T> values_out_;
+ std::vector<uint8_t> bool_buffer_out_;
+ T* values_out_ptr_;
std::vector<int16_t> definition_levels_out_;
std::vector<int16_t> repetition_levels_out_;
@@ -95,105 +127,152 @@ class TestPrimitiveWriter : public ::testing::Test {
std::unique_ptr<InMemoryOutputStream> sink_;
};
-TEST_F(TestPrimitiveWriter, RequiredNonRepeated) {
- std::vector<int64_t> values(100, 128);
+template <typename TestType>
+void TestPrimitiveWriter<TestType>::SetupValuesOut() {
+ values_out_.resize(SMALL_SIZE);
+ values_out_ptr_ = values_out_.data();
+}
+
+template <>
+void TestPrimitiveWriter<BooleanType>::SetupValuesOut() {
+ values_out_.resize(SMALL_SIZE);
+ bool_buffer_out_.resize(SMALL_SIZE);
+ // Initialise every element once so that copying the buffer later does not
+ // trigger Valgrind errors about uninitialised values.
+ std::fill(bool_buffer_out_.begin(), bool_buffer_out_.end(), true);
+ values_out_ptr_ = reinterpret_cast<bool*>(bool_buffer_out_.data());
+}
+
+template <typename TestType>
+void TestPrimitiveWriter<TestType>::SyncValuesOut() {}
+
+template <>
+void TestPrimitiveWriter<BooleanType>::SyncValuesOut() {
+ std::copy(bool_buffer_out_.begin(), bool_buffer_out_.end(), values_out_.begin());
+}
+
+template <typename TestType>
+void TestPrimitiveWriter<TestType>::GenerateData(int64_t num_values) {
+ values_.resize(num_values);
+ InitValues<T>(num_values, values_, buffer_);
+ values_ptr_ = values_.data();
+}
+
+template <>
+void TestPrimitiveWriter<BooleanType>::GenerateData(int64_t num_values) {
+ values_.resize(num_values);
+ InitValues<T>(num_values, values_, buffer_);
+ bool_buffer_.resize(num_values);
+ std::copy(values_.begin(), values_.end(), bool_buffer_.begin());
+ values_ptr_ = reinterpret_cast<bool*>(bool_buffer_.data());
+}
+
+typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
+ BooleanType, ByteArrayType, FLBAType> TestTypes;
+
+TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes);
+
+TYPED_TEST(TestPrimitiveWriter, Required) {
+ this->GenerateData(SMALL_SIZE);
// Test case 1: required and non-repeated, so no definition or repetition levels
- std::unique_ptr<Int64Writer> writer = BuildWriter();
- writer->WriteBatch(values.size(), nullptr, nullptr, values.data());
+ std::unique_ptr<TypedColumnWriter<TypeParam>> writer = this->BuildWriter();
+ writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
writer->Close();
- ReadColumn();
- ASSERT_EQ(100, values_read_);
- ASSERT_EQ(values, values_out_);
+ this->ReadColumn();
+ ASSERT_EQ(SMALL_SIZE, this->values_read_);
+ ASSERT_EQ(this->values_, this->values_out_);
}
-TEST_F(TestPrimitiveWriter, OptionalNonRepeated) {
+TYPED_TEST(TestPrimitiveWriter, Optional) {
// Optional and non-repeated, with definition levels
// but no repetition levels
- SetUpSchemaOptional();
+ this->SetUpSchemaOptional();
- std::vector<int64_t> values(100, 128);
- std::vector<int16_t> definition_levels(100, 1);
+ this->GenerateData(SMALL_SIZE);
+ std::vector<int16_t> definition_levels(SMALL_SIZE, 1);
definition_levels[1] = 0;
- auto writer = BuildWriter();
- writer->WriteBatch(values.size(), definition_levels.data(), nullptr, values.data());
+ auto writer = this->BuildWriter();
+ writer->WriteBatch(
+ this->values_.size(), definition_levels.data(), nullptr, this->values_ptr_);
writer->Close();
- ReadColumn();
- ASSERT_EQ(99, values_read_);
- values_out_.resize(99);
- values.resize(99);
- ASSERT_EQ(values, values_out_);
+ this->ReadColumn();
+ ASSERT_EQ(99, this->values_read_);
+ this->values_out_.resize(99);
+ this->values_.resize(99);
+ ASSERT_EQ(this->values_, this->values_out_);
}
-TEST_F(TestPrimitiveWriter, OptionalRepeated) {
+TYPED_TEST(TestPrimitiveWriter, Repeated) {
// Optional and repeated, so definition and repetition levels
- SetUpSchemaRepeated();
+ this->SetUpSchemaRepeated();
- std::vector<int64_t> values(100, 128);
- std::vector<int16_t> definition_levels(100, 1);
+ this->GenerateData(SMALL_SIZE);
+ std::vector<int16_t> definition_levels(SMALL_SIZE, 1);
definition_levels[1] = 0;
- std::vector<int16_t> repetition_levels(100, 0);
+ std::vector<int16_t> repetition_levels(SMALL_SIZE, 0);
- auto writer = BuildWriter();
- writer->WriteBatch(
- values.size(), definition_levels.data(), repetition_levels.data(), values.data());
+ auto writer = this->BuildWriter();
+ writer->WriteBatch(this->values_.size(), definition_levels.data(),
+ repetition_levels.data(), this->values_ptr_);
writer->Close();
- ReadColumn();
- ASSERT_EQ(99, values_read_);
- values_out_.resize(99);
- values.resize(99);
- ASSERT_EQ(values, values_out_);
+ this->ReadColumn();
+ ASSERT_EQ(SMALL_SIZE - 1, this->values_read_);
+ this->values_out_.resize(SMALL_SIZE - 1);
+ this->values_.resize(SMALL_SIZE - 1);
+ ASSERT_EQ(this->values_, this->values_out_);
}
-TEST_F(TestPrimitiveWriter, RequiredTooFewRows) {
- std::vector<int64_t> values(99, 128);
+TYPED_TEST(TestPrimitiveWriter, RequiredTooFewRows) {
+ this->GenerateData(SMALL_SIZE - 1);
- auto writer = BuildWriter();
- writer->WriteBatch(values.size(), nullptr, nullptr, values.data());
+ auto writer = this->BuildWriter();
+ writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
ASSERT_THROW(writer->Close(), ParquetException);
}
-TEST_F(TestPrimitiveWriter, RequiredTooMany) {
- std::vector<int64_t> values(200, 128);
+TYPED_TEST(TestPrimitiveWriter, RequiredTooMany) {
+ this->GenerateData(2 * SMALL_SIZE);
- auto writer = BuildWriter();
- ASSERT_THROW(writer->WriteBatch(values.size(), nullptr, nullptr, values.data()),
+ auto writer = this->BuildWriter();
+ ASSERT_THROW(
+ writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_),
ParquetException);
}
-TEST_F(TestPrimitiveWriter, OptionalRepeatedTooFewRows) {
+TYPED_TEST(TestPrimitiveWriter, RepeatedTooFewRows) {
// Optional and repeated, so definition and repetition levels
- SetUpSchemaRepeated();
+ this->SetUpSchemaRepeated();
- std::vector<int64_t> values(100, 128);
- std::vector<int16_t> definition_levels(100, 1);
+ this->GenerateData(SMALL_SIZE);
+ std::vector<int16_t> definition_levels(SMALL_SIZE, 1);
definition_levels[1] = 0;
- std::vector<int16_t> repetition_levels(100, 0);
+ std::vector<int16_t> repetition_levels(SMALL_SIZE, 0);
repetition_levels[3] = 1;
- auto writer = BuildWriter();
- writer->WriteBatch(
- values.size(), definition_levels.data(), repetition_levels.data(), values.data());
+ auto writer = this->BuildWriter();
+ writer->WriteBatch(this->values_.size(), definition_levels.data(),
+ repetition_levels.data(), this->values_ptr_);
ASSERT_THROW(writer->Close(), ParquetException);
}
-TEST_F(TestPrimitiveWriter, RequiredNonRepeatedLargeChunk) {
- std::vector<int64_t> values(10000, 128);
+TYPED_TEST(TestPrimitiveWriter, RequiredLargeChunk) {
+ this->GenerateData(LARGE_SIZE);
// Test case 1: required and non-repeated, so no definition or repetition levels
- std::unique_ptr<Int64Writer> writer = BuildWriter(10000);
- writer->WriteBatch(values.size(), nullptr, nullptr, values.data());
+ auto writer = this->BuildWriter(LARGE_SIZE);
+ writer->WriteBatch(this->values_.size(), nullptr, nullptr, this->values_ptr_);
writer->Close();
- // Just read the first 100 to ensure we could read it back in
- ReadColumn();
- ASSERT_EQ(100, values_read_);
- values.resize(100);
- ASSERT_EQ(values, values_out_);
+ // Just read the first SMALL_SIZE rows to ensure we could read it back in
+ this->ReadColumn();
+ ASSERT_EQ(SMALL_SIZE, this->values_read_);
+ this->values_.resize(SMALL_SIZE);
+ ASSERT_EQ(this->values_, this->values_out_);
}
} // namespace test
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bd887e2c/src/parquet/column/scanner-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc
index 1e3ce74..78cd16c 100644
--- a/src/parquet/column/scanner-test.cc
+++ b/src/parquet/column/scanner-test.cc
@@ -31,6 +31,7 @@
#include "parquet/schema/descriptor.h"
#include "parquet/schema/types.h"
#include "parquet/util/test-common.h"
+#include "parquet/column/test-specialization.h"
using std::string;
using std::vector;
@@ -40,43 +41,9 @@ namespace parquet {
using schema::NodePtr;
-static int FLBA_LENGTH = 12;
-
-bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
- return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
-}
-
namespace test {
template <>
-void InitValues<bool>(int num_values, vector<bool>& values, vector<uint8_t>& buffer) {
- values = flip_coins(num_values, 0);
-}
-
-template <>
-void InitValues<Int96>(int num_values, vector<Int96>& values, vector<uint8_t>& buffer) {
- random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
- std::numeric_limits<int32_t>::max(), values.data());
-}
-
-template <>
-void InitValues<ByteArray>(
- int num_values, vector<ByteArray>& values, vector<uint8_t>& buffer) {
- int max_byte_array_len = 12;
- int num_bytes = max_byte_array_len + sizeof(uint32_t);
- size_t nbytes = num_values * num_bytes;
- buffer.resize(nbytes);
- random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len);
-}
-
-template <>
-void InitValues<FLBA>(int num_values, vector<FLBA>& values, vector<uint8_t>& buffer) {
- size_t nbytes = num_values * FLBA_LENGTH;
- buffer.resize(nbytes);
- random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data());
-}
-
-template <>
void InitDictValues<bool>(
int num_values, int dict_per_page, vector<bool>& values, vector<uint8_t>& buffer) {
// No op for bool
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bd887e2c/src/parquet/column/test-specialization.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/test-specialization.h b/src/parquet/column/test-specialization.h
new file mode 100644
index 0000000..ab678b8
--- /dev/null
+++ b/src/parquet/column/test-specialization.h
@@ -0,0 +1,66 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// This header provides the explicit specializations of InitValues for the
+// types that need custom test-data generation (bool, ByteArray, FLBA and
+// Int96), kept in one place so that they can be shared between test files.
+
+#ifndef PARQUET_COLUMN_TEST_SPECIALIZATION_H
+#define PARQUET_COLUMN_TEST_SPECIALIZATION_H
+
+#include <limits>
+#include <vector>
+
+#include "parquet/column/test-util.h"
+
+namespace parquet {
+
+namespace test {
+
+template <>
+void InitValues<bool>(int num_values, vector<bool>& values, vector<uint8_t>& buffer) {
+ values = flip_coins(num_values, 0);
+}
+
+template <>
+void InitValues<ByteArray>(
+ int num_values, vector<ByteArray>& values, vector<uint8_t>& buffer) {
+ int max_byte_array_len = 12;
+ int num_bytes = max_byte_array_len + sizeof(uint32_t);
+ size_t nbytes = num_values * num_bytes;
+ buffer.resize(nbytes);
+ random_byte_array(num_values, 0, buffer.data(), values.data(), max_byte_array_len);
+}
+
+template <>
+void InitValues<FLBA>(int num_values, vector<FLBA>& values, vector<uint8_t>& buffer) {
+ size_t nbytes = num_values * FLBA_LENGTH;
+ buffer.resize(nbytes);
+ random_fixed_byte_array(num_values, 0, buffer.data(), FLBA_LENGTH, values.data());
+}
+
+template <>
+void InitValues<Int96>(int num_values, vector<Int96>& values, vector<uint8_t>& buffer) {
+ random_Int96_numbers(num_values, 0, std::numeric_limits<int32_t>::min(),
+ std::numeric_limits<int32_t>::max(), values.data());
+}
+
+} // namespace test
+
+} // namespace parquet
+
+#endif // PARQUET_COLUMN_TEST_SPECIALIZATION_H
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/bd887e2c/src/parquet/column/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/test-util.h b/src/parquet/column/test-util.h
index d13f052..ba24fb2 100644
--- a/src/parquet/column/test-util.h
+++ b/src/parquet/column/test-util.h
@@ -42,6 +42,12 @@ using std::shared_ptr;
namespace parquet {
+static int FLBA_LENGTH = 12;
+
+bool operator==(const FixedLenByteArray& a, const FixedLenByteArray& b) {
+ return 0 == memcmp(a.ptr, b.ptr, FLBA_LENGTH);
+}
+
namespace test {
template <typename T>