You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by ju...@apache.org on 2016/02/29 19:04:10 UTC
parquet-cpp git commit: PARQUET-545: Improve API to support decimal
type
Repository: parquet-cpp
Updated Branches:
refs/heads/master c6e069297 -> ebb45b1e7
PARQUET-545: Improve API to support decimal type
This PR also
1) Improves scanner coverage.
2) Implements PARQUET-546
3) Adds tests for Types to string
Author: Deepak Majeti <de...@hpe.com>
Closes #65 from majetideepak/PARQUET-545 and squashes the following commits:
df28ccc [Deepak Majeti] removed little redundant code
c5f6d85 [Deepak Majeti] improved schema type coverage
6a18c98 [Deepak Majeti] removed ColumnDescriptor Repetition API
6b450ab [Deepak Majeti] modified scanner test to remove redundant code
1eed0de [Deepak Majeti] Added tests for GroupNodes
0f0e559 [Deepak Majeti] added type to string tests
cab06d8 [Deepak Majeti] Improve coverage for types test
6ce10ce [Deepak Majeti] Tests for PARQUET-545
a643bf4 [Deepak Majeti] Added PrimitiveNode Tests
656942f [Deepak Majeti] restored some necessary checks
addf7f6 [Deepak Majeti] remove unnecessary checks
8a043c0 [Deepak Majeti] modified scanner and column descriptor
a346ca5 [Deepak Majeti] PARQUET-546
bdbcdd7 [Deepak Majeti] Extend Column Descriptor to include Decimal scale and precision
938ded5 [Deepak Majeti] Improve Scanner Coverage
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/ebb45b1e
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/ebb45b1e
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/ebb45b1e
Branch: refs/heads/master
Commit: ebb45b1e7da49e4084602b10aee90dcdf7317768
Parents: c6e0692
Author: Deepak Majeti <de...@hpe.com>
Authored: Mon Feb 29 10:04:00 2016 -0800
Committer: Julien Le Dem <ju...@dremio.com>
Committed: Mon Feb 29 10:04:00 2016 -0800
----------------------------------------------------------------------
src/parquet/CMakeLists.txt | 1 +
src/parquet/column/scanner-test.cc | 91 +++++++++++-----
src/parquet/column/scanner.h | 18 ++--
src/parquet/encodings/encoding-test.cc | 4 +-
src/parquet/schema/descriptor.cc | 11 +-
src/parquet/schema/descriptor.h | 19 ++--
src/parquet/schema/schema-descriptor-test.cc | 6 +-
src/parquet/schema/schema-types-test.cc | 69 ++++++++++--
src/parquet/schema/types.cc | 126 +++++++++++++++++++---
src/parquet/schema/types.h | 23 ++--
src/parquet/types-test.cc | 82 ++++++++++++++
src/parquet/types.h | 67 ++++++++++++
12 files changed, 419 insertions(+), 98 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/src/parquet/CMakeLists.txt b/src/parquet/CMakeLists.txt
index 97547ce..8a9b860 100644
--- a/src/parquet/CMakeLists.txt
+++ b/src/parquet/CMakeLists.txt
@@ -23,4 +23,5 @@ install(FILES
DESTINATION include/parquet)
ADD_PARQUET_TEST(public-api-test)
+ADD_PARQUET_TEST(types-test)
ADD_PARQUET_TEST(reader-test)
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/column/scanner-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner-test.cc b/src/parquet/column/scanner-test.cc
index 785db08..fcaf65e 100644
--- a/src/parquet/column/scanner-test.cc
+++ b/src/parquet/column/scanner-test.cc
@@ -103,7 +103,7 @@ class TestFlatScanner : public ::testing::Test {
TypedScanner<Type::type_num>* scanner =
reinterpret_cast<TypedScanner<Type::type_num>* >(scanner_.get());
T val;
- bool is_null;
+ bool is_null = false;
int16_t def_level;
int16_t rep_level;
size_t j = 0;
@@ -113,15 +113,15 @@ class TestFlatScanner : public ::testing::Test {
if (!is_null) {
ASSERT_EQ(values_[j++], val) << i <<"V"<< j;
}
- if (!d->is_required()) {
+ if (d->max_definition_level() > 0) {
ASSERT_EQ(def_levels_[i], def_level) << i <<"D"<< j;
}
- if (d->is_repeated()) {
+ if (d->max_repetition_level() > 0) {
ASSERT_EQ(rep_levels_[i], rep_level) << i <<"R"<< j;
}
}
ASSERT_EQ(num_values_, j);
- ASSERT_FALSE(scanner->HasNext());
+ ASSERT_FALSE(scanner->Next(&val, &def_level, &rep_level, &is_null));
}
void Clear() {
@@ -140,21 +140,25 @@ class TestFlatScanner : public ::testing::Test {
}
void InitDescriptors(std::shared_ptr<ColumnDescriptor>& d1,
- std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>& d3) {
+ std::shared_ptr<ColumnDescriptor>& d2, std::shared_ptr<ColumnDescriptor>& d3,
+ int length) {
NodePtr type;
- type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num);
+ type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED, Type::type_num,
+ LogicalType::NONE, length);
d1.reset(new ColumnDescriptor(type, 0, 0));
- type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num);
+ type = schema::PrimitiveNode::Make("c2", Repetition::OPTIONAL, Type::type_num,
+ LogicalType::NONE, length);
d2.reset(new ColumnDescriptor(type, 4, 0));
- type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num);
+ type = schema::PrimitiveNode::Make("c3", Repetition::REPEATED, Type::type_num,
+ LogicalType::NONE, length);
d3.reset(new ColumnDescriptor(type, 4, 2));
}
- void ExecuteAll(int num_pages, int num_levels, int batch_size) {
+ void ExecuteAll(int num_pages, int num_levels, int batch_size, int type_length) {
std::shared_ptr<ColumnDescriptor> d1;
std::shared_ptr<ColumnDescriptor> d2;
std::shared_ptr<ColumnDescriptor> d3;
- InitDescriptors(d1, d2, d3);
+ InitDescriptors(d1, d2, d3, type_length);
// evaluate REQUIRED pages
Execute(num_pages, num_levels, batch_size, d1.get());
// evaluate OPTIONAL pages
@@ -203,42 +207,71 @@ void TestFlatScanner<FLBAType>::InitValues() {
values_.data());
}
-template<>
-void TestFlatScanner<FLBAType>::InitDescriptors(
- std::shared_ptr<ColumnDescriptor>& d1, std::shared_ptr<ColumnDescriptor>& d2,
- std::shared_ptr<ColumnDescriptor>& d3) {
- NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED,
- FLBA_LENGTH, LogicalType::UTF8);
- d1.reset(new ColumnDescriptor(type, 0, 0));
- type = schema::PrimitiveNode::MakeFLBA("c2", Repetition::OPTIONAL,
- FLBA_LENGTH, LogicalType::UTF8);
- d2.reset(new ColumnDescriptor(type, 4, 0));
- type = schema::PrimitiveNode::MakeFLBA("c3", Repetition::REPEATED,
- FLBA_LENGTH, LogicalType::UTF8);
- d3.reset(new ColumnDescriptor(type, 4, 2));
-}
-
typedef TestFlatScanner<FLBAType> TestFlatFLBAScanner;
static int num_levels_per_page = 100;
static int num_pages = 20;
static int batch_size = 32;
-TYPED_TEST_CASE(TestFlatScanner, ParquetTypes);
+typedef ::testing::Types<BooleanType, Int32Type, Int64Type, Int96Type,
+ FloatType, DoubleType, ByteArrayType> TestTypes;
+
+typedef TestFlatScanner<FLBAType> TestFLBAFlatScanner;
+
+TYPED_TEST_CASE(TestFlatScanner, TestTypes);
TYPED_TEST(TestFlatScanner, TestScanner) {
- this->ExecuteAll(num_pages, num_levels_per_page, batch_size);
+ this->ExecuteAll(num_pages, num_levels_per_page, batch_size, 0);
+}
+
+TEST_F(TestFLBAFlatScanner, TestScanner) {
+ this->ExecuteAll(num_pages, num_levels_per_page, batch_size, FLBA_LENGTH);
}
//PARQUET 502
TEST_F(TestFlatFLBAScanner, TestSmallBatch) {
- NodePtr type = schema::PrimitiveNode::MakeFLBA("c1", Repetition::REQUIRED,
- FLBA_LENGTH, LogicalType::UTF8);
+ NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
const ColumnDescriptor d(type, 0, 0);
MakePages(&d, 1, 100);
InitScanner(&d);
CheckResults(1, &d);
}
+TEST_F(TestFlatFLBAScanner, TestDescriptorAPI) {
+ NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
+ const ColumnDescriptor d(type, 4, 0);
+ MakePages(&d, 1, 100);
+ InitScanner(&d);
+ TypedScanner<FLBAType::type_num>* scanner =
+ reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get());
+ ASSERT_EQ(10, scanner->descr()->type_precision());
+ ASSERT_EQ(2, scanner->descr()->type_scale());
+ ASSERT_EQ(FLBA_LENGTH, scanner->descr()->type_length());
+}
+
+TEST_F(TestFlatFLBAScanner, TestFLBAPrinterNext) {
+ NodePtr type = schema::PrimitiveNode::Make("c1", Repetition::OPTIONAL,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, FLBA_LENGTH, 10, 2);
+ const ColumnDescriptor d(type, 4, 0);
+ MakePages(&d, 1, 100);
+ InitScanner(&d);
+ TypedScanner<FLBAType::type_num>* scanner =
+ reinterpret_cast<TypedScanner<FLBAType::type_num>* >(scanner_.get());
+ size_t j = 0;
+ scanner->SetBatchSize(batch_size);
+ std::stringstream ss_fail;
+ for (size_t i = 0; i < num_levels_; i++) {
+ std::stringstream ss;
+ scanner->PrintNext(ss, 17);
+ std::string result = ss.str();
+ ASSERT_LE(17, result.size()) << i;
+ }
+ ASSERT_THROW(scanner->PrintNext(ss_fail, 17), ParquetException);
+}
+
+//Test for GroupNode
+
} // namespace test
} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/column/scanner.h
----------------------------------------------------------------------
diff --git a/src/parquet/column/scanner.h b/src/parquet/column/scanner.h
index f3f5719..8569a94 100644
--- a/src/parquet/column/scanner.h
+++ b/src/parquet/column/scanner.h
@@ -45,8 +45,8 @@ class Scanner {
values_buffered_(0),
reader_(reader) {
// TODO: don't allocate for required fields
- def_levels_.resize(reader->descr()->is_optional() ? batch_size_ : 0);
- rep_levels_.resize(reader->descr()->is_repeated() ? batch_size_ : 0);
+ def_levels_.resize(descr()->max_definition_level() > 0 ? batch_size_ : 0);
+ rep_levels_.resize(descr()->max_repetition_level() > 0 ? batch_size_ : 0);
}
virtual ~Scanner() {}
@@ -114,10 +114,8 @@ class TypedScanner : public Scanner {
return false;
}
}
- *def_level = descr()->is_optional() ?
- def_levels_[level_offset_] : descr()->max_definition_level();
- *rep_level = descr()->is_repeated() ?
- rep_levels_[level_offset_] : descr()->max_repetition_level();
+ *def_level = descr()->max_definition_level() > 0 ? def_levels_[level_offset_] : 0;
+ *rep_level = descr()->max_repetition_level() > 0 ? rep_levels_[level_offset_] : 0;
level_offset_++;
return true;
}
@@ -172,10 +170,12 @@ class TypedScanner : public Scanner {
virtual void PrintNext(std::ostream& out, int width) {
T val;
- bool is_null;
-
+ bool is_null = false;
char buffer[25];
- NextValue(&val, &is_null);
+
+ if (!NextValue(&val, &is_null)) {
+ throw ParquetException("No more values buffered");
+ }
if (is_null) {
std::string null_fmt = format_fwf<Type::BYTE_ARRAY>(width);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/encodings/encoding-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/encodings/encoding-test.cc b/src/parquet/encodings/encoding-test.cc
index 10310ed..f060f96 100644
--- a/src/parquet/encodings/encoding-test.cc
+++ b/src/parquet/encodings/encoding-test.cc
@@ -134,8 +134,8 @@ std::shared_ptr<ColumnDescriptor> ExampleDescr() {
template <>
std::shared_ptr<ColumnDescriptor> ExampleDescr<FLBA>() {
- auto node = schema::PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL,
- flba_length, LogicalType::UTF8);
+ auto node = schema::PrimitiveNode::Make("name", Repetition::OPTIONAL,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, flba_length, 10, 2);
return std::make_shared<ColumnDescriptor>(node, 0, 0);
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/descriptor.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.cc b/src/parquet/schema/descriptor.cc
index 02b5060..b3fefee 100644
--- a/src/parquet/schema/descriptor.cc
+++ b/src/parquet/schema/descriptor.cc
@@ -84,10 +84,15 @@ const ColumnDescriptor* SchemaDescriptor::Column(size_t i) const {
return &leaves_[i];
}
+int ColumnDescriptor::type_scale() const {
+ return primitive_node_->decimal_metadata().scale;
+}
+
+int ColumnDescriptor::type_precision() const {
+ return primitive_node_->decimal_metadata().precision;
+}
+
int ColumnDescriptor::type_length() const {
- if (primitive_node_->physical_type() != Type::FIXED_LEN_BYTE_ARRAY) {
- throw ParquetException("Not a FIXED_LEN_BYTE_ARRAY");
- }
return primitive_node_->type_length();
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/descriptor.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/descriptor.h b/src/parquet/schema/descriptor.h
index 62066ef..4c6f50d 100644
--- a/src/parquet/schema/descriptor.h
+++ b/src/parquet/schema/descriptor.h
@@ -24,7 +24,6 @@
#include <string>
#include <unordered_map>
#include <vector>
-
#include "parquet/schema/types.h"
#include "parquet/types.h"
@@ -54,23 +53,19 @@ class ColumnDescriptor {
return primitive_node_->physical_type();
}
- const std::string& name() const {
- return primitive_node_->name();
+ LogicalType::type logical_type() const {
+ return primitive_node_->logical_type();
}
- bool is_required() const {
- return max_definition_level_ == 0;
+ const std::string& name() const {
+ return primitive_node_->name();
}
- bool is_optional() const {
- return max_definition_level_ > 0;
- }
+ int type_length() const;
- bool is_repeated() const {
- return max_repetition_level_ > 0;
- }
+ int type_precision() const;
- int type_length() const;
+ int type_scale() const;
private:
schema::NodePtr node_;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/schema-descriptor-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-descriptor-test.cc b/src/parquet/schema/schema-descriptor-test.cc
index 3b1734b..519d968 100644
--- a/src/parquet/schema/schema-descriptor-test.cc
+++ b/src/parquet/schema/schema-descriptor-test.cc
@@ -46,11 +46,11 @@ TEST(TestColumnDescriptor, TestAttrs) {
ASSERT_EQ(Type::BYTE_ARRAY, descr.physical_type());
- ASSERT_THROW(descr.type_length(), ParquetException);
+ ASSERT_EQ(-1, descr.type_length());
// Test FIXED_LEN_BYTE_ARRAY
- node = PrimitiveNode::MakeFLBA("name", Repetition::OPTIONAL,
- 12, LogicalType::UTF8);
+ node = PrimitiveNode::Make("name", Repetition::OPTIONAL,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 12, 10, 4);
descr = ColumnDescriptor(node, 4, 1);
ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, descr.physical_type());
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/schema-types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-types-test.cc b/src/parquet/schema/schema-types-test.cc
index cac7dc5..f512f0f 100644
--- a/src/parquet/schema/schema-types-test.cc
+++ b/src/parquet/schema/schema-types-test.cc
@@ -21,6 +21,7 @@
#include <string>
#include <vector>
+#include "parquet/exception.h"
#include "parquet/schema/test-util.h"
#include "parquet/schema/types.h"
#include "parquet/thrift/parquet_types.h"
@@ -130,15 +131,15 @@ TEST_F(TestPrimitiveNode, FromParquet) {
parquet::Type::FIXED_LEN_BYTE_ARRAY);
elt.__set_converted_type(ConvertedType::DECIMAL);
elt.__set_type_length(6);
- elt.__set_scale(12);
- elt.__set_precision(2);
+ elt.__set_scale(2);
+ elt.__set_precision(12);
Convert(&elt);
ASSERT_EQ(Type::FIXED_LEN_BYTE_ARRAY, prim_node_->physical_type());
ASSERT_EQ(LogicalType::DECIMAL, prim_node_->logical_type());
ASSERT_EQ(6, prim_node_->type_length());
- ASSERT_EQ(12, prim_node_->decimal_metadata().scale);
- ASSERT_EQ(2, prim_node_->decimal_metadata().precision);
+ ASSERT_EQ(2, prim_node_->decimal_metadata().scale);
+ ASSERT_EQ(12, prim_node_->decimal_metadata().precision);
}
TEST_F(TestPrimitiveNode, Equals) {
@@ -154,17 +155,66 @@ TEST_F(TestPrimitiveNode, Equals) {
ASSERT_FALSE(node1.Equals(&node4));
ASSERT_TRUE(node1.Equals(&node5));
- PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
- flba1.SetTypeLength(12);
+ PrimitiveNode flba1("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+ LogicalType::DECIMAL, 12, 4, 2);
- PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+ PrimitiveNode flba2("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+ LogicalType::DECIMAL, 1, 4, 2);
flba2.SetTypeLength(12);
- PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY);
+ PrimitiveNode flba3("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+ LogicalType::DECIMAL, 1, 4, 2);
flba3.SetTypeLength(16);
+ PrimitiveNode flba4("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+ LogicalType::DECIMAL, 12, 4, 0);
+
+ PrimitiveNode flba5("foo", Repetition::REQUIRED, Type::FIXED_LEN_BYTE_ARRAY,
+ LogicalType::NONE, 12, 4, 0);
+
ASSERT_TRUE(flba1.Equals(&flba2));
ASSERT_FALSE(flba1.Equals(&flba3));
+ ASSERT_FALSE(flba1.Equals(&flba4));
+ ASSERT_FALSE(flba1.Equals(&flba5));
+}
+
+TEST_F(TestPrimitiveNode, PhysicalLogicalMapping) {
+ ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::INT32, LogicalType::INT_32));
+ ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::BYTE_ARRAY, LogicalType::JSON));
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::INT32, LogicalType::JSON), ParquetException);
+ ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::INT64, LogicalType::TIMESTAMP_MILLIS));
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::INT32, LogicalType::INT_64), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::BYTE_ARRAY, LogicalType::INT_8), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::BYTE_ARRAY, LogicalType::INTERVAL), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::ENUM), ParquetException);
+ ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::BYTE_ARRAY, LogicalType::ENUM));
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 2, 4), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FLOAT, LogicalType::DECIMAL, 0, 2, 4), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 0, 4, 0), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 0, 4), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 4, -1), ParquetException);
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 2, 4), ParquetException);
+ ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::DECIMAL, 10, 6, 4));
+ ASSERT_NO_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 12));
+ ASSERT_THROW(PrimitiveNode::Make("foo", Repetition::REQUIRED,
+ Type::FIXED_LEN_BYTE_ARRAY, LogicalType::INTERVAL, 10), ParquetException);
}
// ----------------------------------------------------------------------
@@ -220,11 +270,14 @@ TEST_F(TestGroupNode, Equals) {
// This is copied in the GroupNode ctor, so this is okay
f2.push_back(Float("four", Repetition::OPTIONAL));
GroupNode group4("group", Repetition::REPEATED, f2);
+ GroupNode group5("group", Repetition::REPEATED, Fields1());
+ ASSERT_TRUE(group1.Equals(&group1));
ASSERT_TRUE(group1.Equals(&group2));
ASSERT_FALSE(group1.Equals(&group3));
ASSERT_FALSE(group1.Equals(&group4));
+ ASSERT_FALSE(group5.Equals(&group4));
}
} // namespace schema
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/types.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.cc b/src/parquet/schema/types.cc
index fae7c84..b57fd37 100644
--- a/src/parquet/schema/types.cc
+++ b/src/parquet/schema/types.cc
@@ -40,17 +40,117 @@ bool Node::EqualsInternal(const Node* other) const {
// ----------------------------------------------------------------------
// Primitive node
+PrimitiveNode::PrimitiveNode(const std::string& name, Repetition::type repetition,
+ Type::type type, LogicalType::type logical_type,
+ int length, int precision, int scale, int id) :
+ Node(Node::PRIMITIVE, name, repetition, logical_type, id),
+ physical_type_(type), type_length_(length) {
+ std::stringstream ss;
+ // Check if the physical and logical types match
+ // Mapping referred from Apache parquet-mr as on 2016-02-22
+ switch (logical_type) {
+ case LogicalType::NONE:
+ // Logical type not set
+ // Clients should be able to read these values
+ decimal_metadata_.precision = precision;
+ decimal_metadata_.scale = scale;
+ break;
+ case LogicalType::UTF8:
+ case LogicalType::JSON:
+ case LogicalType::BSON:
+ if (type != Type::BYTE_ARRAY) {
+ ss << logical_type_to_string(logical_type);
+ ss << " can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case LogicalType::DECIMAL:
+ if ((type != Type::INT32) &&
+ (type != Type::INT64) &&
+ (type != Type::BYTE_ARRAY) &&
+ (type != Type::FIXED_LEN_BYTE_ARRAY)) {
+ ss << "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY, and FIXED";
+ throw ParquetException(ss.str());
+ }
+ if (precision <= 0) {
+ ss << "Invalid DECIMAL precision: " << precision;
+ throw ParquetException(ss.str());
+ }
+ if (scale < 0) {
+ ss << "Invalid DECIMAL scale: " << scale;
+ throw ParquetException(ss.str());
+ }
+ if (scale > precision) {
+ ss << "Invalid DECIMAL scale " << scale;
+ ss << " cannot be greater than precision " << precision;
+ throw ParquetException(ss.str());
+ }
+ decimal_metadata_.precision = precision;
+ decimal_metadata_.scale = scale;
+ break;
+ case LogicalType::DATE:
+ case LogicalType::TIME_MILLIS:
+ case LogicalType::UINT_8:
+ case LogicalType::UINT_16:
+ case LogicalType::UINT_32:
+ case LogicalType::INT_8:
+ case LogicalType::INT_16:
+ case LogicalType::INT_32:
+ if (type != Type::INT32) {
+ ss << logical_type_to_string(logical_type);
+ ss << " can only annotate INT32";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case LogicalType::TIMESTAMP_MILLIS:
+ case LogicalType::UINT_64:
+ case LogicalType::INT_64:
+ if (type != Type::INT64) {
+ ss << logical_type_to_string(logical_type);
+ ss << " can only annotate INT64";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case LogicalType::INTERVAL:
+ if ((type != Type::FIXED_LEN_BYTE_ARRAY) || (length != 12)) {
+ ss << "INTERVAL can only annotate FIXED_LEN_BYTE_ARRAY(12)";
+ throw ParquetException(ss.str());
+ }
+ break;
+ case LogicalType::ENUM:
+ if (type != Type::BYTE_ARRAY) {
+ ss << "ENUM can only annotate BYTE_ARRAY fields";
+ throw ParquetException(ss.str());
+ }
+ break;
+ default:
+ ss << logical_type_to_string(logical_type);
+ ss << " can not be applied to a primitive type";
+ throw ParquetException(ss.str());
+ }
+ if (type == Type::FIXED_LEN_BYTE_ARRAY) {
+ if (length <= 0) {
+ ss << "Invalid FIXED_LEN_BYTE_ARRAY length: " << length;
+ throw ParquetException(ss.str());
+ }
+ type_length_ = length;
+ }
+}
+
bool PrimitiveNode::EqualsInternal(const PrimitiveNode* other) const {
- if (physical_type_ != other->physical_type_) {
- return false;
- } else if (logical_type_ == LogicalType::DECIMAL) {
- // TODO(wesm): metadata
- ParquetException::NYI("comparing decimals");
+ bool is_equal = true;
+ if ((physical_type_ != other->physical_type_) ||
+ (logical_type_ != other->logical_type_)) {
return false;
- } else if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
- return type_length_ == other->type_length_;
}
- return true;
+ if (logical_type_ == LogicalType::DECIMAL) {
+ is_equal &= (decimal_metadata_.precision == other->decimal_metadata_.precision) &&
+ (decimal_metadata_.scale == other->decimal_metadata_.scale);
+ }
+ if (physical_type_ == Type::FIXED_LEN_BYTE_ARRAY) {
+ is_equal &= (type_length_ == other->type_length_);
+ }
+ return is_equal;
}
bool PrimitiveNode::Equals(const Node* other) const {
@@ -134,14 +234,8 @@ std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
std::unique_ptr<PrimitiveNode> result = std::unique_ptr<PrimitiveNode>(
new PrimitiveNode(params.name, params.repetition,
- FromThrift(element->type), params.logical_type, node_id));
-
- if (element->type == parquet::Type::FIXED_LEN_BYTE_ARRAY) {
- result->SetTypeLength(element->type_length);
- if (params.logical_type == LogicalType::DECIMAL) {
- result->SetDecimalMetadata(element->scale, element->precision);
- }
- }
+ FromThrift(element->type), params.logical_type,
+ element->type_length, element->precision, element->scale, node_id));
// Return as unique_ptr to the base type
return std::unique_ptr<Node>(result.release());
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/schema/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/schema/types.h b/src/parquet/schema/types.h
index e76323f..2eaee8b 100644
--- a/src/parquet/schema/types.h
+++ b/src/parquet/schema/types.h
@@ -177,17 +177,10 @@ class PrimitiveNode : public Node {
static inline NodePtr Make(const std::string& name,
Repetition::type repetition, Type::type type,
- LogicalType::type logical_type = LogicalType::NONE) {
- return NodePtr(new PrimitiveNode(name, repetition, type, logical_type));
- }
-
- // Alternate constructor for FIXED_LEN_BYTE_ARRAY (FLBA)
- static inline NodePtr MakeFLBA(const std::string& name,
- Repetition::type repetition, int32_t type_length,
- LogicalType::type logical_type = LogicalType::NONE) {
- NodePtr result = Make(name, repetition, Type::FIXED_LEN_BYTE_ARRAY, logical_type);
- static_cast<PrimitiveNode*>(result.get())->SetTypeLength(type_length);
- return result;
+ LogicalType::type logical_type = LogicalType::NONE,
+ int length = -1, int precision = -1, int scale = -1) {
+ return NodePtr(new PrimitiveNode(name, repetition, type, logical_type,
+ length, precision, scale));
}
virtual bool Equals(const Node* other) const;
@@ -208,11 +201,8 @@ class PrimitiveNode : public Node {
private:
PrimitiveNode(const std::string& name, Repetition::type repetition,
- Type::type type,
- LogicalType::type logical_type = LogicalType::NONE,
- int id = -1) :
- Node(Node::PRIMITIVE, name, repetition, logical_type, id),
- physical_type_(type) {}
+ Type::type type, LogicalType::type logical_type = LogicalType::NONE,
+ int length = -1, int precision = -1, int scale = -1, int id = -1);
Type::type physical_type_;
int32_t type_length_;
@@ -234,6 +224,7 @@ class PrimitiveNode : public Node {
FRIEND_TEST(TestPrimitiveNode, Attrs);
FRIEND_TEST(TestPrimitiveNode, Equals);
+ FRIEND_TEST(TestPrimitiveNode, PhysicalLogicalMapping);
FRIEND_TEST(TestPrimitiveNode, FromParquet);
};
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/types-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/types-test.cc b/src/parquet/types-test.cc
new file mode 100644
index 0000000..b1bfacd
--- /dev/null
+++ b/src/parquet/types-test.cc
@@ -0,0 +1,82 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <string>
+
+#include "parquet/types.h"
+
+namespace parquet_cpp {
+
+TEST(TestTypeToString, PhysicalTypes) {
+ ASSERT_STREQ("BOOLEAN", type_to_string(Type::BOOLEAN).c_str());
+ ASSERT_STREQ("INT32", type_to_string(Type::INT32).c_str());
+ ASSERT_STREQ("INT64", type_to_string(Type::INT64).c_str());
+ ASSERT_STREQ("INT96", type_to_string(Type::INT96).c_str());
+ ASSERT_STREQ("FLOAT", type_to_string(Type::FLOAT).c_str());
+ ASSERT_STREQ("DOUBLE", type_to_string(Type::DOUBLE).c_str());
+ ASSERT_STREQ("BYTE_ARRAY", type_to_string(Type::BYTE_ARRAY).c_str());
+ ASSERT_STREQ("FIXED_LEN_BYTE_ARRAY",
+ type_to_string(Type::FIXED_LEN_BYTE_ARRAY).c_str());
+}
+
+TEST(TestLogicalTypeToString, LogicalTypes) {
+ ASSERT_STREQ("NONE",
+ logical_type_to_string(LogicalType::NONE).c_str());
+ ASSERT_STREQ("UTF8",
+ logical_type_to_string(LogicalType::UTF8).c_str());
+ ASSERT_STREQ("MAP_KEY_VALUE",
+ logical_type_to_string(LogicalType::MAP_KEY_VALUE).c_str());
+ ASSERT_STREQ("LIST",
+ logical_type_to_string(LogicalType::LIST).c_str());
+ ASSERT_STREQ("ENUM",
+ logical_type_to_string(LogicalType::ENUM).c_str());
+ ASSERT_STREQ("DECIMAL",
+ logical_type_to_string(LogicalType::DECIMAL).c_str());
+ ASSERT_STREQ("DATE",
+ logical_type_to_string(LogicalType::DATE).c_str());
+ ASSERT_STREQ("TIME_MILLIS",
+ logical_type_to_string(LogicalType::TIME_MILLIS).c_str());
+ ASSERT_STREQ("TIMESTAMP_MILLIS",
+ logical_type_to_string(LogicalType::TIMESTAMP_MILLIS).c_str());
+ ASSERT_STREQ("UINT_8",
+ logical_type_to_string(LogicalType::UINT_8).c_str());
+ ASSERT_STREQ("UINT_16",
+ logical_type_to_string(LogicalType::UINT_16).c_str());
+ ASSERT_STREQ("UINT_32",
+ logical_type_to_string(LogicalType::UINT_32).c_str());
+ ASSERT_STREQ("UINT_64",
+ logical_type_to_string(LogicalType::UINT_64).c_str());
+ ASSERT_STREQ("INT_8",
+ logical_type_to_string(LogicalType::INT_8).c_str());
+ ASSERT_STREQ("INT_16",
+ logical_type_to_string(LogicalType::INT_16).c_str());
+ ASSERT_STREQ("INT_32",
+ logical_type_to_string(LogicalType::INT_32).c_str());
+ ASSERT_STREQ("INT_64",
+ logical_type_to_string(LogicalType::INT_64).c_str());
+ ASSERT_STREQ("JSON",
+ logical_type_to_string(LogicalType::JSON).c_str());
+ ASSERT_STREQ("BSON",
+ logical_type_to_string(LogicalType::BSON).c_str());
+ ASSERT_STREQ("INTERVAL",
+ logical_type_to_string(LogicalType::INTERVAL).c_str());
+}
+
+
+} // namespace parquet_cpp
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/ebb45b1e/src/parquet/types.h
----------------------------------------------------------------------
diff --git a/src/parquet/types.h b/src/parquet/types.h
index f59f6a9..e29f11c 100644
--- a/src/parquet/types.h
+++ b/src/parquet/types.h
@@ -316,6 +316,73 @@ static inline std::string type_to_string(Type::type t) {
}
}
+static inline std::string logical_type_to_string(LogicalType::type t) {
+ switch (t) {
+ case LogicalType::NONE:
+ return "NONE";
+ break;
+ case LogicalType::UTF8:
+ return "UTF8";
+ break;
+ case LogicalType::MAP_KEY_VALUE:
+ return "MAP_KEY_VALUE";
+ break;
+ case LogicalType::LIST:
+ return "LIST";
+ break;
+ case LogicalType::ENUM:
+ return "ENUM";
+ break;
+ case LogicalType::DECIMAL:
+ return "DECIMAL";
+ break;
+ case LogicalType::DATE:
+ return "DATE";
+ break;
+ case LogicalType::TIME_MILLIS:
+ return "TIME_MILLIS";
+ break;
+ case LogicalType::TIMESTAMP_MILLIS:
+ return "TIMESTAMP_MILLIS";
+ break;
+ case LogicalType::UINT_8:
+ return "UINT_8";
+ break;
+ case LogicalType::UINT_16:
+ return "UINT_16";
+ break;
+ case LogicalType::UINT_32:
+ return "UINT_32";
+ break;
+ case LogicalType::UINT_64:
+ return "UINT_64";
+ break;
+ case LogicalType::INT_8:
+ return "INT_8";
+ break;
+ case LogicalType::INT_16:
+ return "INT_16";
+ break;
+ case LogicalType::INT_32:
+ return "INT_32";
+ break;
+ case LogicalType::INT_64:
+ return "INT_64";
+ break;
+ case LogicalType::JSON:
+ return "JSON";
+ break;
+ case LogicalType::BSON:
+ return "BSON";
+ break;
+ case LogicalType::INTERVAL:
+ return "INTERVAL";
+ break;
+ default:
+ return "UNKNOWN";
+ break;
+ }
+}
} // namespace parquet_cpp
#endif // PARQUET_TYPES_H