You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2016/12/21 13:55:29 UTC
parquet-cpp git commit: PARQUET-812: Read BYTE_ARRAY with no logical type as arrow::BinaryArray
Repository: parquet-cpp
Updated Branches:
refs/heads/master 16466b109 -> d7625e9ad
PARQUET-812: Read BYTE_ARRAY with no logical type as arrow::BinaryArray
Depends on ARROW-374: that change needs to be merged first, and then the thirdparty Arrow git hash must be updated to include it.
Author: Wes McKinney <we...@twosigma.com>
Closes #206 from wesm/PARQUET-812 and squashes the following commits:
73fb8d0 [Wes McKinney] Update thirdparty arrow version
5db7908 [Wes McKinney] typo
fc0d559 [Wes McKinney] Read unadorned BYTE_ARRAY into arrow::BinaryArray
Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/d7625e9a
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/d7625e9a
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/d7625e9a
Branch: refs/heads/master
Commit: d7625e9ad6417485d6e8aa8432cf342a532ab3c3
Parents: 16466b1
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Dec 21 14:54:53 2016 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Dec 21 14:54:53 2016 +0100
----------------------------------------------------------------------
src/parquet/arrow/arrow-reader-writer-test.cc | 15 ++++--
src/parquet/arrow/arrow-schema-test.cc | 29 +++++------
src/parquet/arrow/reader.cc | 26 ++++++++--
src/parquet/arrow/schema.cc | 58 ++++++++--------------
src/parquet/arrow/test-util.h | 21 +++++---
src/parquet/arrow/utils.h | 10 ++--
src/parquet/arrow/writer.cc | 15 +++---
src/parquet/column/column-writer-test.cc | 3 +-
src/parquet/compression/brotli-codec.cc | 4 +-
src/parquet/compression/codec.h | 4 +-
src/parquet/schema/printer.cc | 11 ++--
src/parquet/schema/schema-printer-test.cc | 4 +-
thirdparty/versions.sh | 2 +-
13 files changed, 108 insertions(+), 94 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index a8a5db0..6d2b0d5 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -167,7 +167,15 @@ struct test_traits<::arrow::StringType> {
static std::string const value;
};
+template <>
+struct test_traits<::arrow::BinaryType> {
+ static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
+ static constexpr LogicalType::type logical_enum = LogicalType::NONE;
+ static std::string const value;
+};
+
const std::string test_traits<::arrow::StringType>::value("Test");
+const std::string test_traits<::arrow::BinaryType>::value("\x00\x01\x02\x03");
template <typename T>
using ParquetDataType = DataType<test_traits<T>::parquet_enum>;
@@ -247,7 +255,7 @@ class TestParquetIO : public ::testing::Test {
std::shared_ptr<InMemoryOutputStream> sink_;
};
-// We habe separate tests for UInt32Type as this is currently the only type
+// We have separate tests for UInt32Type as this is currently the only type
// where a roundtrip does not yield the identical Array structure.
// There we write an UInt32 Array but receive an Int64 Array as result for
// Parquet version 1.0.
@@ -255,7 +263,7 @@ class TestParquetIO : public ::testing::Test {
typedef ::testing::Types<::arrow::BooleanType, ::arrow::UInt8Type, ::arrow::Int8Type,
::arrow::UInt16Type, ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type,
::arrow::Int64Type, ::arrow::TimestampType, ::arrow::FloatType, ::arrow::DoubleType,
- ::arrow::StringType>
+ ::arrow::StringType, ::arrow::BinaryType>
TestTypes;
TYPED_TEST_CASE(TestParquetIO, TestTypes);
@@ -504,8 +512,7 @@ using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
std::shared_ptr<Array> values;
- ::arrow::StringBuilder builder(
- ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>());
+ ::arrow::StringBuilder builder(::arrow::default_memory_pool(), ::arrow::utf8());
for (size_t i = 0; i < SMALL_SIZE; i++) {
builder.Append("");
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/arrow-schema-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-schema-test.cc b/src/parquet/arrow/arrow-schema-test.cc
index 3d07561..43e57d8 100644
--- a/src/parquet/arrow/arrow-schema-test.cc
+++ b/src/parquet/arrow/arrow-schema-test.cc
@@ -38,21 +38,19 @@ namespace parquet {
namespace arrow {
-const auto BOOL = std::make_shared<::arrow::BooleanType>();
-const auto UINT8 = std::make_shared<::arrow::UInt8Type>();
-const auto INT32 = std::make_shared<::arrow::Int32Type>();
-const auto INT64 = std::make_shared<::arrow::Int64Type>();
-const auto FLOAT = std::make_shared<::arrow::FloatType>();
-const auto DOUBLE = std::make_shared<::arrow::DoubleType>();
-const auto UTF8 = std::make_shared<::arrow::StringType>();
-const auto TIMESTAMP_MS =
- std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::MILLI);
-const auto TIMESTAMP_NS =
- std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::NANO);
+const auto BOOL = ::arrow::boolean();
+const auto UINT8 = ::arrow::uint8();
+const auto INT32 = ::arrow::int32();
+const auto INT64 = ::arrow::int64();
+const auto FLOAT = ::arrow::float32();
+const auto DOUBLE = ::arrow::float64();
+const auto UTF8 = ::arrow::utf8();
+const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI);
+const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO);
+
// TODO: This requires parquet-cpp implementing the MICROS enum value
// const auto TIMESTAMP_US = std::make_shared<TimestampType>(TimestampType::Unit::MICRO);
-const auto BINARY =
- std::make_shared<::arrow::ListType>(std::make_shared<Field>("", UINT8));
+const auto BINARY = ::arrow::binary();
const auto DECIMAL_8_4 = std::make_shared<::arrow::DecimalType>(8, 4);
class TestConvertParquetSchema : public ::testing::Test {
@@ -412,11 +410,14 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) {
PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE));
arrow_fields.push_back(std::make_shared<Field>("double", DOUBLE));
- // TODO: String types need to be clarified a bit more in the Arrow spec
parquet_fields.push_back(PrimitiveNode::Make(
"string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8));
arrow_fields.push_back(std::make_shared<Field>("string", UTF8));
+ parquet_fields.push_back(PrimitiveNode::Make(
+ "binary", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::NONE));
+ arrow_fields.push_back(std::make_shared<Field>("binary", BINARY));
+
ASSERT_OK(ConvertSchema(arrow_fields));
CheckFlatSchema(parquet_fields);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 2efa806..135867c 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -28,6 +28,7 @@
#include "parquet/arrow/utils.h"
#include "arrow/api.h"
+#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
using arrow::Array;
@@ -90,10 +91,14 @@ class FlatColumnReader::Impl {
virtual ~Impl() {}
Status NextBatch(int batch_size, std::shared_ptr<Array>* out);
+
template <typename ArrowType, typename ParquetType>
Status TypedReadBatch(int batch_size, std::shared_ptr<Array>* out);
template <typename ArrowType>
+ Status ReadByteArrayBatch(int batch_size, std::shared_ptr<Array>* out);
+
+ template <typename ArrowType>
Status InitDataBuffer(int batch_size);
template <typename ArrowType, typename ParquetType>
void ReadNullableFlatBatch(const int16_t* def_levels,
@@ -486,11 +491,13 @@ Status FlatColumnReader::Impl::TypedReadBatch<::arrow::BooleanType, BooleanType>
}
}
-template <>
-Status FlatColumnReader::Impl::TypedReadBatch<::arrow::StringType, ByteArrayType>(
+template <typename ArrowType>
+Status FlatColumnReader::Impl::ReadByteArrayBatch(
int batch_size, std::shared_ptr<Array>* out) {
+ using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+
int values_to_read = batch_size;
- ::arrow::StringBuilder builder(pool_, field_->type);
+ BuilderType builder(pool_, field_->type);
while ((values_to_read > 0) && column_reader_) {
values_buffer_.Resize(values_to_read * sizeof(ByteArray));
if (descr_->max_definition_level() > 0) {
@@ -528,6 +535,18 @@ Status FlatColumnReader::Impl::TypedReadBatch<::arrow::StringType, ByteArrayType
return builder.Finish(out);
}
+template <>
+Status FlatColumnReader::Impl::TypedReadBatch<::arrow::BinaryType, ByteArrayType>(
+ int batch_size, std::shared_ptr<Array>* out) {
+ return ReadByteArrayBatch<::arrow::BinaryType>(batch_size, out);
+}
+
+template <>
+Status FlatColumnReader::Impl::TypedReadBatch<::arrow::StringType, ByteArrayType>(
+ int batch_size, std::shared_ptr<Array>* out) {
+ return ReadByteArrayBatch<::arrow::StringType>(batch_size, out);
+}
+
#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \
case ::arrow::Type::ENUM: \
return TypedReadBatch<ArrowType, ParquetType>(batch_size, out); \
@@ -553,6 +572,7 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr<Array>*
TYPED_BATCH_CASE(FLOAT, ::arrow::FloatType, FloatType)
TYPED_BATCH_CASE(DOUBLE, ::arrow::DoubleType, DoubleType)
TYPED_BATCH_CASE(STRING, ::arrow::StringType, ByteArrayType)
+ TYPED_BATCH_CASE(BINARY, ::arrow::BinaryType, ByteArrayType)
case ::arrow::Type::TIMESTAMP: {
::arrow::TimestampType* timestamp_type =
static_cast<::arrow::TimestampType*>(field_->type.get());
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/schema.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/schema.cc b/src/parquet/arrow/schema.cc
index e578ec2..3e5e7d9 100644
--- a/src/parquet/arrow/schema.cc
+++ b/src/parquet/arrow/schema.cc
@@ -44,24 +44,8 @@ namespace parquet {
namespace arrow {
-const auto BOOL = std::make_shared<::arrow::BooleanType>();
-const auto UINT8 = std::make_shared<::arrow::UInt8Type>();
-const auto INT8 = std::make_shared<::arrow::Int8Type>();
-const auto UINT16 = std::make_shared<::arrow::UInt16Type>();
-const auto INT16 = std::make_shared<::arrow::Int16Type>();
-const auto UINT32 = std::make_shared<::arrow::UInt32Type>();
-const auto INT32 = std::make_shared<::arrow::Int32Type>();
-const auto UINT64 = std::make_shared<::arrow::UInt64Type>();
-const auto INT64 = std::make_shared<::arrow::Int64Type>();
-const auto FLOAT = std::make_shared<::arrow::FloatType>();
-const auto DOUBLE = std::make_shared<::arrow::DoubleType>();
-const auto UTF8 = std::make_shared<::arrow::StringType>();
-const auto TIMESTAMP_MS =
- std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::MILLI);
-const auto TIMESTAMP_NS =
- std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::NANO);
-const auto BINARY =
- std::make_shared<::arrow::ListType>(std::make_shared<::arrow::Field>("", UINT8));
+const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI);
+const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO);
TypePtr MakeDecimalType(const PrimitiveNode* node) {
int precision = node->decimal_metadata().precision;
@@ -72,14 +56,14 @@ TypePtr MakeDecimalType(const PrimitiveNode* node) {
static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) {
switch (node->logical_type()) {
case LogicalType::UTF8:
- *out = UTF8;
+ *out = ::arrow::utf8();
break;
case LogicalType::DECIMAL:
*out = MakeDecimalType(node);
break;
default:
// BINARY
- *out = BINARY;
+ *out = ::arrow::binary();
break;
}
return Status::OK();
@@ -88,7 +72,7 @@ static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) {
static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) {
switch (node->logical_type()) {
case LogicalType::NONE:
- *out = BINARY;
+ *out = ::arrow::binary();
break;
case LogicalType::DECIMAL:
*out = MakeDecimalType(node);
@@ -104,22 +88,22 @@ static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) {
static Status FromInt32(const PrimitiveNode* node, TypePtr* out) {
switch (node->logical_type()) {
case LogicalType::NONE:
- *out = INT32;
+ *out = ::arrow::int32();
break;
case LogicalType::UINT_8:
- *out = UINT8;
+ *out = ::arrow::uint8();
break;
case LogicalType::INT_8:
- *out = INT8;
+ *out = ::arrow::int8();
break;
case LogicalType::UINT_16:
- *out = UINT16;
+ *out = ::arrow::uint16();
break;
case LogicalType::INT_16:
- *out = INT16;
+ *out = ::arrow::int16();
break;
case LogicalType::UINT_32:
- *out = UINT32;
+ *out = ::arrow::uint32();
break;
case LogicalType::DECIMAL:
*out = MakeDecimalType(node);
@@ -134,10 +118,10 @@ static Status FromInt32(const PrimitiveNode* node, TypePtr* out) {
static Status FromInt64(const PrimitiveNode* node, TypePtr* out) {
switch (node->logical_type()) {
case LogicalType::NONE:
- *out = INT64;
+ *out = ::arrow::int64();
break;
case LogicalType::UINT_64:
- *out = UINT64;
+ *out = ::arrow::uint64();
break;
case LogicalType::DECIMAL:
*out = MakeDecimalType(node);
@@ -155,7 +139,7 @@ static Status FromInt64(const PrimitiveNode* node, TypePtr* out) {
Status FromPrimitive(const PrimitiveNode* primitive, TypePtr* out) {
switch (primitive->physical_type()) {
case ParquetType::BOOLEAN:
- *out = BOOL;
+ *out = ::arrow::boolean();
break;
case ParquetType::INT32:
RETURN_NOT_OK(FromInt32(primitive, out));
@@ -167,13 +151,12 @@ Status FromPrimitive(const PrimitiveNode* primitive, TypePtr* out) {
*out = TIMESTAMP_NS;
break;
case ParquetType::FLOAT:
- *out = FLOAT;
+ *out = ::arrow::float32();
break;
case ParquetType::DOUBLE:
- *out = DOUBLE;
+ *out = ::arrow::float64();
break;
case ParquetType::BYTE_ARRAY:
- // TODO: Do we have that type in Arrow?
RETURN_NOT_OK(FromByteArray(primitive, out));
break;
case ParquetType::FIXED_LEN_BYTE_ARRAY:
@@ -211,13 +194,13 @@ Status NodeToList(const GroupNode* group, TypePtr* out) {
// List of primitive type
std::shared_ptr<Field> item_field;
RETURN_NOT_OK(NodeToField(list_group->field(0), &item_field));
- *out = std::make_shared<::arrow::ListType>(item_field);
+ *out = ::arrow::list(item_field);
} else {
// List of struct
std::shared_ptr<::arrow::DataType> inner_type;
RETURN_NOT_OK(StructFromGroup(list_group, &inner_type));
auto item_field = std::make_shared<Field>(list_node->name(), inner_type, false);
- *out = std::make_shared<::arrow::ListType>(item_field);
+ *out = ::arrow::list(item_field);
}
} else if (list_node->is_repeated()) {
// repeated primitive node
@@ -225,7 +208,7 @@ Status NodeToList(const GroupNode* group, TypePtr* out) {
const PrimitiveNode* primitive = static_cast<const PrimitiveNode*>(list_node.get());
RETURN_NOT_OK(FromPrimitive(primitive, &inner_type));
auto item_field = std::make_shared<Field>(list_node->name(), inner_type, false);
- *out = std::make_shared<::arrow::ListType>(item_field);
+ *out = ::arrow::list(item_field);
} else {
return Status::NotImplemented(
"Non-repeated groups in a LIST-annotated group are not supported.");
@@ -247,7 +230,7 @@ Status NodeToField(const NodePtr& node, std::shared_ptr<Field>* out) {
const PrimitiveNode* primitive = static_cast<const PrimitiveNode*>(node.get());
RETURN_NOT_OK(FromPrimitive(primitive, &inner_type));
auto item_field = std::make_shared<Field>(node->name(), inner_type, false);
- type = std::make_shared<::arrow::ListType>(item_field);
+ type = ::arrow::list(item_field);
nullable = false;
} else if (node->is_group()) {
const GroupNode* group = static_cast<const GroupNode*>(node.get());
@@ -423,5 +406,4 @@ Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
}
} // namespace arrow
-
} // namespace parquet
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index bdb2bab..f996c2c 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -37,6 +37,9 @@ template <typename ArrowType>
using is_arrow_string = std::is_same<ArrowType, ::arrow::StringType>;
template <typename ArrowType>
+using is_arrow_binary = std::is_same<ArrowType, ::arrow::BinaryType>;
+
+template <typename ArrowType>
using is_arrow_bool = std::is_same<ArrowType, ::arrow::BooleanType>;
template <class ArrowType>
@@ -62,10 +65,11 @@ typename std::enable_if<is_arrow_int<ArrowType>::value, Status>::type NonNullArr
}
template <class ArrowType>
-typename std::enable_if<is_arrow_string<ArrowType>::value, Status>::type NonNullArray(
- size_t size, std::shared_ptr<Array>* out) {
- ::arrow::StringBuilder builder(
- ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>());
+typename std::enable_if<
+ is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
+NonNullArray(size_t size, std::shared_ptr<Array>* out) {
+ using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+ BuilderType builder(::arrow::default_memory_pool(), std::make_shared<ArrowType>());
for (size_t i = 0; i < size; i++) {
builder.Append("test-string");
}
@@ -121,16 +125,17 @@ typename std::enable_if<is_arrow_int<ArrowType>::value, Status>::type NullableAr
// This helper function only supports (size/2) nulls yet.
template <typename ArrowType>
-typename std::enable_if<is_arrow_string<ArrowType>::value, Status>::type NullableArray(
- size_t size, size_t num_nulls, std::shared_ptr<::arrow::Array>* out) {
+typename std::enable_if<
+ is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
+NullableArray(size_t size, size_t num_nulls, std::shared_ptr<::arrow::Array>* out) {
std::vector<uint8_t> valid_bytes(size, 1);
for (size_t i = 0; i < num_nulls; i++) {
valid_bytes[i * 2] = 0;
}
- ::arrow::StringBuilder builder(
- ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>());
+ using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+ BuilderType builder(::arrow::default_memory_pool(), std::make_shared<ArrowType>());
for (size_t i = 0; i < size; i++) {
builder.Append("test-string");
}
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/utils.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/utils.h b/src/parquet/arrow/utils.h
index 015421b..9c2abfa 100644
--- a/src/parquet/arrow/utils.h
+++ b/src/parquet/arrow/utils.h
@@ -26,11 +26,11 @@
namespace parquet {
namespace arrow {
-#define PARQUET_CATCH_NOT_OK(s) \
- try { \
- (s); \
- } catch (const ::parquet::ParquetException& e) { \
- return ::arrow::Status::IOError(e.what()); \
+#define PARQUET_CATCH_NOT_OK(s) \
+ try { \
+ (s); \
+ } catch (const ::parquet::ParquetException& e) { \
+ return ::arrow::Status::IOError(e.what()); \
}
#define PARQUET_IGNORE_NOT_OK(s) \
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index e588c84..b7663a3 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -28,11 +28,11 @@
#include "arrow/api.h"
+using arrow::BinaryArray;
using arrow::MemoryPool;
using arrow::PoolBuffer;
using arrow::PrimitiveArray;
using arrow::Status;
-using arrow::StringArray;
using arrow::Table;
using parquet::ParquetFileWriter;
@@ -82,7 +82,7 @@ class FileWriter::Impl {
}
Status WriteFlatColumnChunk(const PrimitiveArray* data, int64_t offset, int64_t length);
- Status WriteFlatColumnChunk(const StringArray* data, int64_t offset, int64_t length);
+ Status WriteFlatColumnChunk(const BinaryArray* data, int64_t offset, int64_t length);
Status Close();
virtual ~Impl() {}
@@ -253,7 +253,7 @@ Status FileWriter::Impl::WriteFlatColumnChunk(
}
Status FileWriter::Impl::WriteFlatColumnChunk(
- const StringArray* data, int64_t offset, int64_t length) {
+ const BinaryArray* data, int64_t offset, int64_t length) {
ColumnWriter* column_writer;
PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
DCHECK((offset + length) <= data->length());
@@ -312,10 +312,11 @@ Status FileWriter::WriteFlatColumnChunk(
const ::arrow::Array* array, int64_t offset, int64_t length) {
int64_t real_length = length;
if (length == -1) { real_length = array->length(); }
- if (array->type_enum() == ::arrow::Type::STRING) {
- auto string_array = dynamic_cast<const ::arrow::StringArray*>(array);
- DCHECK(string_array);
- return impl_->WriteFlatColumnChunk(string_array, offset, real_length);
+ if (array->type_enum() == ::arrow::Type::STRING ||
+ array->type_enum() == ::arrow::Type::BINARY) {
+ auto binary_array = static_cast<const ::arrow::BinaryArray*>(array);
+ DCHECK(binary_array);
+ return impl_->WriteFlatColumnChunk(binary_array, offset, real_length);
} else {
auto primitive_array = dynamic_cast<const PrimitiveArray*>(array);
if (!primitive_array) {
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 5a65175..68d79d1 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -214,7 +214,8 @@ void TestPrimitiveWriter<FLBAType>::ReadColumnFully(Compression::type compressio
}
typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
- BooleanType, ByteArrayType, FLBAType> TestTypes;
+ BooleanType, ByteArrayType, FLBAType>
+ TestTypes;
TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/compression/brotli-codec.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression/brotli-codec.cc b/src/parquet/compression/brotli-codec.cc
index 24ff230..8118206 100644
--- a/src/parquet/compression/brotli-codec.cc
+++ b/src/parquet/compression/brotli-codec.cc
@@ -15,10 +15,10 @@
// specific language governing permissions and limitations
// under the License.
-#include <cstdint>
-#include <cstdlib>
#include <brotli/decode.h>
#include <brotli/encode.h>
+#include <cstdint>
+#include <cstdlib>
#include "parquet/compression/codec.h"
#include "parquet/exception.h"
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/compression/codec.h
----------------------------------------------------------------------
diff --git a/src/parquet/compression/codec.h b/src/parquet/compression/codec.h
index e803a8c..abd4899 100644
--- a/src/parquet/compression/codec.h
+++ b/src/parquet/compression/codec.h
@@ -65,8 +65,8 @@ class BrotliCodec : public Codec {
void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
uint8_t* output_buffer) override;
- int64_t Compress(int64_t input_len, const uint8_t* input,
- int64_t output_buffer_len, uint8_t* output_buffer) override;
+ int64_t Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len,
+ uint8_t* output_buffer) override;
int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/schema/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc
index c4ab3e7..ca11244 100644
--- a/src/parquet/schema/printer.cc
+++ b/src/parquet/schema/printer.cc
@@ -96,9 +96,8 @@ static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) {
auto lt = node->logical_type();
if (lt == LogicalType::DECIMAL) {
- stream << " (" << LogicalTypeToString(lt) << "(" <<
- node->decimal_metadata().precision << "," <<
- node->decimal_metadata().scale << "))";
+ stream << " (" << LogicalTypeToString(lt) << "(" << node->decimal_metadata().precision
+ << "," << node->decimal_metadata().scale << "))";
} else if (lt != LogicalType::NONE) {
stream << " (" << LogicalTypeToString(lt) << ")";
}
@@ -120,10 +119,8 @@ void SchemaPrinter::Visit(const GroupNode* node) {
PrintRepLevel(node->repetition(), stream_);
stream_ << " group " << node->name();
auto lt = node->logical_type();
- if (lt != LogicalType::NONE) {
- stream_ << " (" << LogicalTypeToString(lt) << ")";
- }
- stream_ << " {" << std::endl;
+ if (lt != LogicalType::NONE) { stream_ << " (" << LogicalTypeToString(lt) << ")"; }
+ stream_ << " {" << std::endl;
}
indent_ += indent_width_;
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/schema/schema-printer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-printer-test.cc b/src/parquet/schema/schema-printer-test.cc
index e594f6f..29140f0 100644
--- a/src/parquet/schema/schema-printer-test.cc
+++ b/src/parquet/schema/schema-printer-test.cc
@@ -51,8 +51,8 @@ TEST(TestSchemaPrinter, Examples) {
NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
fields.push_back(bag);
- fields.push_back(PrimitiveNode::Make("c", Repetition::REQUIRED, Type::INT32,
- LogicalType::DECIMAL, -1, 3, 2));
+ fields.push_back(PrimitiveNode::Make(
+ "c", Repetition::REQUIRED, Type::INT32, LogicalType::DECIMAL, -1, 3, 2));
NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/thirdparty/versions.sh
----------------------------------------------------------------------
diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh
index c968c7b..4233669 100755
--- a/thirdparty/versions.sh
+++ b/thirdparty/versions.sh
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-ARROW_VERSION="2c10d7ccec3c07fb061e1988be16aecaf9916af4"
+ARROW_VERSION="268ffbeffb1cd0617e52d381d500a2d10f61124c"
ARROW_URL="https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz"
ARROW_BASEDIR="arrow-${ARROW_VERSION}"