You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@parquet.apache.org by uw...@apache.org on 2016/12/21 13:55:29 UTC

parquet-cpp git commit: PARQUET-812: Read BYTE_ARRAY with no logical type as arrow::BinaryArray

Repository: parquet-cpp
Updated Branches:
  refs/heads/master 16466b109 -> d7625e9ad


PARQUET-812: Read BYTE_ARRAY with no logical type as arrow::BinaryArray

Depends on ARROW-374, which needs to be merged first; the thirdparty Arrow git hash must then be updated to point at a commit that includes it.

Author: Wes McKinney <we...@twosigma.com>

Closes #206 from wesm/PARQUET-812 and squashes the following commits:

73fb8d0 [Wes McKinney] Update thirdparty arrow version
5db7908 [Wes McKinney] typo
fc0d559 [Wes McKinney] Read unadorned BYTE_ARRAY into arrow::BinaryArray


Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo
Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/d7625e9a
Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/d7625e9a
Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/d7625e9a

Branch: refs/heads/master
Commit: d7625e9ad6417485d6e8aa8432cf342a532ab3c3
Parents: 16466b1
Author: Wes McKinney <we...@twosigma.com>
Authored: Wed Dec 21 14:54:53 2016 +0100
Committer: Uwe L. Korn <uw...@xhochy.com>
Committed: Wed Dec 21 14:54:53 2016 +0100

----------------------------------------------------------------------
 src/parquet/arrow/arrow-reader-writer-test.cc | 15 ++++--
 src/parquet/arrow/arrow-schema-test.cc        | 29 +++++------
 src/parquet/arrow/reader.cc                   | 26 ++++++++--
 src/parquet/arrow/schema.cc                   | 58 ++++++++--------------
 src/parquet/arrow/test-util.h                 | 21 +++++---
 src/parquet/arrow/utils.h                     | 10 ++--
 src/parquet/arrow/writer.cc                   | 15 +++---
 src/parquet/column/column-writer-test.cc      |  3 +-
 src/parquet/compression/brotli-codec.cc       |  4 +-
 src/parquet/compression/codec.h               |  4 +-
 src/parquet/schema/printer.cc                 | 11 ++--
 src/parquet/schema/schema-printer-test.cc     |  4 +-
 thirdparty/versions.sh                        |  2 +-
 13 files changed, 108 insertions(+), 94 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/arrow-reader-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc
index a8a5db0..6d2b0d5 100644
--- a/src/parquet/arrow/arrow-reader-writer-test.cc
+++ b/src/parquet/arrow/arrow-reader-writer-test.cc
@@ -167,7 +167,15 @@ struct test_traits<::arrow::StringType> {
   static std::string const value;
 };
 
+template <>
+struct test_traits<::arrow::BinaryType> {
+  static constexpr ParquetType::type parquet_enum = ParquetType::BYTE_ARRAY;
+  static constexpr LogicalType::type logical_enum = LogicalType::NONE;
+  static std::string const value;
+};
+
 const std::string test_traits<::arrow::StringType>::value("Test");
+const std::string test_traits<::arrow::BinaryType>::value("\x00\x01\x02\x03");
 
 template <typename T>
 using ParquetDataType = DataType<test_traits<T>::parquet_enum>;
@@ -247,7 +255,7 @@ class TestParquetIO : public ::testing::Test {
   std::shared_ptr<InMemoryOutputStream> sink_;
 };
 
-// We habe separate tests for UInt32Type as this is currently the only type
+// We have separate tests for UInt32Type as this is currently the only type
 // where a roundtrip does not yield the identical Array structure.
 // There we write an UInt32 Array but receive an Int64 Array as result for
 // Parquet version 1.0.
@@ -255,7 +263,7 @@ class TestParquetIO : public ::testing::Test {
 typedef ::testing::Types<::arrow::BooleanType, ::arrow::UInt8Type, ::arrow::Int8Type,
     ::arrow::UInt16Type, ::arrow::Int16Type, ::arrow::Int32Type, ::arrow::UInt64Type,
     ::arrow::Int64Type, ::arrow::TimestampType, ::arrow::FloatType, ::arrow::DoubleType,
-    ::arrow::StringType>
+    ::arrow::StringType, ::arrow::BinaryType>
     TestTypes;
 
 TYPED_TEST_CASE(TestParquetIO, TestTypes);
@@ -504,8 +512,7 @@ using TestStringParquetIO = TestParquetIO<::arrow::StringType>;
 
 TEST_F(TestStringParquetIO, EmptyStringColumnRequiredWrite) {
   std::shared_ptr<Array> values;
-  ::arrow::StringBuilder builder(
-      ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>());
+  ::arrow::StringBuilder builder(::arrow::default_memory_pool(), ::arrow::utf8());
   for (size_t i = 0; i < SMALL_SIZE; i++) {
     builder.Append("");
   }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/arrow-schema-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/arrow-schema-test.cc b/src/parquet/arrow/arrow-schema-test.cc
index 3d07561..43e57d8 100644
--- a/src/parquet/arrow/arrow-schema-test.cc
+++ b/src/parquet/arrow/arrow-schema-test.cc
@@ -38,21 +38,19 @@ namespace parquet {
 
 namespace arrow {
 
-const auto BOOL = std::make_shared<::arrow::BooleanType>();
-const auto UINT8 = std::make_shared<::arrow::UInt8Type>();
-const auto INT32 = std::make_shared<::arrow::Int32Type>();
-const auto INT64 = std::make_shared<::arrow::Int64Type>();
-const auto FLOAT = std::make_shared<::arrow::FloatType>();
-const auto DOUBLE = std::make_shared<::arrow::DoubleType>();
-const auto UTF8 = std::make_shared<::arrow::StringType>();
-const auto TIMESTAMP_MS =
-    std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::MILLI);
-const auto TIMESTAMP_NS =
-    std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::NANO);
+const auto BOOL = ::arrow::boolean();
+const auto UINT8 = ::arrow::uint8();
+const auto INT32 = ::arrow::int32();
+const auto INT64 = ::arrow::int64();
+const auto FLOAT = ::arrow::float32();
+const auto DOUBLE = ::arrow::float64();
+const auto UTF8 = ::arrow::utf8();
+const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI);
+const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO);
+
 // TODO: This requires parquet-cpp implementing the MICROS enum value
 // const auto TIMESTAMP_US = std::make_shared<TimestampType>(TimestampType::Unit::MICRO);
-const auto BINARY =
-    std::make_shared<::arrow::ListType>(std::make_shared<Field>("", UINT8));
+const auto BINARY = ::arrow::binary();
 const auto DECIMAL_8_4 = std::make_shared<::arrow::DecimalType>(8, 4);
 
 class TestConvertParquetSchema : public ::testing::Test {
@@ -412,11 +410,14 @@ TEST_F(TestConvertArrowSchema, ParquetFlatPrimitives) {
       PrimitiveNode::Make("double", Repetition::OPTIONAL, ParquetType::DOUBLE));
   arrow_fields.push_back(std::make_shared<Field>("double", DOUBLE));
 
-  // TODO: String types need to be clarified a bit more in the Arrow spec
   parquet_fields.push_back(PrimitiveNode::Make(
       "string", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::UTF8));
   arrow_fields.push_back(std::make_shared<Field>("string", UTF8));
 
+  parquet_fields.push_back(PrimitiveNode::Make(
+      "binary", Repetition::OPTIONAL, ParquetType::BYTE_ARRAY, LogicalType::NONE));
+  arrow_fields.push_back(std::make_shared<Field>("binary", BINARY));
+
   ASSERT_OK(ConvertSchema(arrow_fields));
 
   CheckFlatSchema(parquet_fields);

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/reader.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc
index 2efa806..135867c 100644
--- a/src/parquet/arrow/reader.cc
+++ b/src/parquet/arrow/reader.cc
@@ -28,6 +28,7 @@
 #include "parquet/arrow/utils.h"
 
 #include "arrow/api.h"
+#include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
 
 using arrow::Array;
@@ -90,10 +91,14 @@ class FlatColumnReader::Impl {
   virtual ~Impl() {}
 
   Status NextBatch(int batch_size, std::shared_ptr<Array>* out);
+
   template <typename ArrowType, typename ParquetType>
   Status TypedReadBatch(int batch_size, std::shared_ptr<Array>* out);
 
   template <typename ArrowType>
+  Status ReadByteArrayBatch(int batch_size, std::shared_ptr<Array>* out);
+
+  template <typename ArrowType>
   Status InitDataBuffer(int batch_size);
   template <typename ArrowType, typename ParquetType>
   void ReadNullableFlatBatch(const int16_t* def_levels,
@@ -486,11 +491,13 @@ Status FlatColumnReader::Impl::TypedReadBatch<::arrow::BooleanType, BooleanType>
   }
 }
 
-template <>
-Status FlatColumnReader::Impl::TypedReadBatch<::arrow::StringType, ByteArrayType>(
+template <typename ArrowType>
+Status FlatColumnReader::Impl::ReadByteArrayBatch(
     int batch_size, std::shared_ptr<Array>* out) {
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+
   int values_to_read = batch_size;
-  ::arrow::StringBuilder builder(pool_, field_->type);
+  BuilderType builder(pool_, field_->type);
   while ((values_to_read > 0) && column_reader_) {
     values_buffer_.Resize(values_to_read * sizeof(ByteArray));
     if (descr_->max_definition_level() > 0) {
@@ -528,6 +535,18 @@ Status FlatColumnReader::Impl::TypedReadBatch<::arrow::StringType, ByteArrayType
   return builder.Finish(out);
 }
 
+template <>
+Status FlatColumnReader::Impl::TypedReadBatch<::arrow::BinaryType, ByteArrayType>(
+    int batch_size, std::shared_ptr<Array>* out) {
+  return ReadByteArrayBatch<::arrow::BinaryType>(batch_size, out);
+}
+
+template <>
+Status FlatColumnReader::Impl::TypedReadBatch<::arrow::StringType, ByteArrayType>(
+    int batch_size, std::shared_ptr<Array>* out) {
+  return ReadByteArrayBatch<::arrow::StringType>(batch_size, out);
+}
+
 #define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType)              \
   case ::arrow::Type::ENUM:                                         \
     return TypedReadBatch<ArrowType, ParquetType>(batch_size, out); \
@@ -553,6 +572,7 @@ Status FlatColumnReader::Impl::NextBatch(int batch_size, std::shared_ptr<Array>*
     TYPED_BATCH_CASE(FLOAT, ::arrow::FloatType, FloatType)
     TYPED_BATCH_CASE(DOUBLE, ::arrow::DoubleType, DoubleType)
     TYPED_BATCH_CASE(STRING, ::arrow::StringType, ByteArrayType)
+    TYPED_BATCH_CASE(BINARY, ::arrow::BinaryType, ByteArrayType)
     case ::arrow::Type::TIMESTAMP: {
       ::arrow::TimestampType* timestamp_type =
           static_cast<::arrow::TimestampType*>(field_->type.get());

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/schema.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/schema.cc b/src/parquet/arrow/schema.cc
index e578ec2..3e5e7d9 100644
--- a/src/parquet/arrow/schema.cc
+++ b/src/parquet/arrow/schema.cc
@@ -44,24 +44,8 @@ namespace parquet {
 
 namespace arrow {
 
-const auto BOOL = std::make_shared<::arrow::BooleanType>();
-const auto UINT8 = std::make_shared<::arrow::UInt8Type>();
-const auto INT8 = std::make_shared<::arrow::Int8Type>();
-const auto UINT16 = std::make_shared<::arrow::UInt16Type>();
-const auto INT16 = std::make_shared<::arrow::Int16Type>();
-const auto UINT32 = std::make_shared<::arrow::UInt32Type>();
-const auto INT32 = std::make_shared<::arrow::Int32Type>();
-const auto UINT64 = std::make_shared<::arrow::UInt64Type>();
-const auto INT64 = std::make_shared<::arrow::Int64Type>();
-const auto FLOAT = std::make_shared<::arrow::FloatType>();
-const auto DOUBLE = std::make_shared<::arrow::DoubleType>();
-const auto UTF8 = std::make_shared<::arrow::StringType>();
-const auto TIMESTAMP_MS =
-    std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::MILLI);
-const auto TIMESTAMP_NS =
-    std::make_shared<::arrow::TimestampType>(::arrow::TimestampType::Unit::NANO);
-const auto BINARY =
-    std::make_shared<::arrow::ListType>(std::make_shared<::arrow::Field>("", UINT8));
+const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI);
+const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO);
 
 TypePtr MakeDecimalType(const PrimitiveNode* node) {
   int precision = node->decimal_metadata().precision;
@@ -72,14 +56,14 @@ TypePtr MakeDecimalType(const PrimitiveNode* node) {
 static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) {
   switch (node->logical_type()) {
     case LogicalType::UTF8:
-      *out = UTF8;
+      *out = ::arrow::utf8();
       break;
     case LogicalType::DECIMAL:
       *out = MakeDecimalType(node);
       break;
     default:
       // BINARY
-      *out = BINARY;
+      *out = ::arrow::binary();
       break;
   }
   return Status::OK();
@@ -88,7 +72,7 @@ static Status FromByteArray(const PrimitiveNode* node, TypePtr* out) {
 static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) {
   switch (node->logical_type()) {
     case LogicalType::NONE:
-      *out = BINARY;
+      *out = ::arrow::binary();
       break;
     case LogicalType::DECIMAL:
       *out = MakeDecimalType(node);
@@ -104,22 +88,22 @@ static Status FromFLBA(const PrimitiveNode* node, TypePtr* out) {
 static Status FromInt32(const PrimitiveNode* node, TypePtr* out) {
   switch (node->logical_type()) {
     case LogicalType::NONE:
-      *out = INT32;
+      *out = ::arrow::int32();
       break;
     case LogicalType::UINT_8:
-      *out = UINT8;
+      *out = ::arrow::uint8();
       break;
     case LogicalType::INT_8:
-      *out = INT8;
+      *out = ::arrow::int8();
       break;
     case LogicalType::UINT_16:
-      *out = UINT16;
+      *out = ::arrow::uint16();
       break;
     case LogicalType::INT_16:
-      *out = INT16;
+      *out = ::arrow::int16();
       break;
     case LogicalType::UINT_32:
-      *out = UINT32;
+      *out = ::arrow::uint32();
       break;
     case LogicalType::DECIMAL:
       *out = MakeDecimalType(node);
@@ -134,10 +118,10 @@ static Status FromInt32(const PrimitiveNode* node, TypePtr* out) {
 static Status FromInt64(const PrimitiveNode* node, TypePtr* out) {
   switch (node->logical_type()) {
     case LogicalType::NONE:
-      *out = INT64;
+      *out = ::arrow::int64();
       break;
     case LogicalType::UINT_64:
-      *out = UINT64;
+      *out = ::arrow::uint64();
       break;
     case LogicalType::DECIMAL:
       *out = MakeDecimalType(node);
@@ -155,7 +139,7 @@ static Status FromInt64(const PrimitiveNode* node, TypePtr* out) {
 Status FromPrimitive(const PrimitiveNode* primitive, TypePtr* out) {
   switch (primitive->physical_type()) {
     case ParquetType::BOOLEAN:
-      *out = BOOL;
+      *out = ::arrow::boolean();
       break;
     case ParquetType::INT32:
       RETURN_NOT_OK(FromInt32(primitive, out));
@@ -167,13 +151,12 @@ Status FromPrimitive(const PrimitiveNode* primitive, TypePtr* out) {
       *out = TIMESTAMP_NS;
       break;
     case ParquetType::FLOAT:
-      *out = FLOAT;
+      *out = ::arrow::float32();
       break;
     case ParquetType::DOUBLE:
-      *out = DOUBLE;
+      *out = ::arrow::float64();
       break;
     case ParquetType::BYTE_ARRAY:
-      // TODO: Do we have that type in Arrow?
       RETURN_NOT_OK(FromByteArray(primitive, out));
       break;
     case ParquetType::FIXED_LEN_BYTE_ARRAY:
@@ -211,13 +194,13 @@ Status NodeToList(const GroupNode* group, TypePtr* out) {
         // List of primitive type
         std::shared_ptr<Field> item_field;
         RETURN_NOT_OK(NodeToField(list_group->field(0), &item_field));
-        *out = std::make_shared<::arrow::ListType>(item_field);
+        *out = ::arrow::list(item_field);
       } else {
         // List of struct
         std::shared_ptr<::arrow::DataType> inner_type;
         RETURN_NOT_OK(StructFromGroup(list_group, &inner_type));
         auto item_field = std::make_shared<Field>(list_node->name(), inner_type, false);
-        *out = std::make_shared<::arrow::ListType>(item_field);
+        *out = ::arrow::list(item_field);
       }
     } else if (list_node->is_repeated()) {
       // repeated primitive node
@@ -225,7 +208,7 @@ Status NodeToList(const GroupNode* group, TypePtr* out) {
       const PrimitiveNode* primitive = static_cast<const PrimitiveNode*>(list_node.get());
       RETURN_NOT_OK(FromPrimitive(primitive, &inner_type));
       auto item_field = std::make_shared<Field>(list_node->name(), inner_type, false);
-      *out = std::make_shared<::arrow::ListType>(item_field);
+      *out = ::arrow::list(item_field);
     } else {
       return Status::NotImplemented(
           "Non-repeated groups in a LIST-annotated group are not supported.");
@@ -247,7 +230,7 @@ Status NodeToField(const NodePtr& node, std::shared_ptr<Field>* out) {
     const PrimitiveNode* primitive = static_cast<const PrimitiveNode*>(node.get());
     RETURN_NOT_OK(FromPrimitive(primitive, &inner_type));
     auto item_field = std::make_shared<Field>(node->name(), inner_type, false);
-    type = std::make_shared<::arrow::ListType>(item_field);
+    type = ::arrow::list(item_field);
     nullable = false;
   } else if (node->is_group()) {
     const GroupNode* group = static_cast<const GroupNode*>(node.get());
@@ -423,5 +406,4 @@ Status ToParquetSchema(const ::arrow::Schema* arrow_schema,
 }
 
 }  // namespace arrow
-
 }  // namespace parquet

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h
index bdb2bab..f996c2c 100644
--- a/src/parquet/arrow/test-util.h
+++ b/src/parquet/arrow/test-util.h
@@ -37,6 +37,9 @@ template <typename ArrowType>
 using is_arrow_string = std::is_same<ArrowType, ::arrow::StringType>;
 
 template <typename ArrowType>
+using is_arrow_binary = std::is_same<ArrowType, ::arrow::BinaryType>;
+
+template <typename ArrowType>
 using is_arrow_bool = std::is_same<ArrowType, ::arrow::BooleanType>;
 
 template <class ArrowType>
@@ -62,10 +65,11 @@ typename std::enable_if<is_arrow_int<ArrowType>::value, Status>::type NonNullArr
 }
 
 template <class ArrowType>
-typename std::enable_if<is_arrow_string<ArrowType>::value, Status>::type NonNullArray(
-    size_t size, std::shared_ptr<Array>* out) {
-  ::arrow::StringBuilder builder(
-      ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>());
+typename std::enable_if<
+    is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
+NonNullArray(size_t size, std::shared_ptr<Array>* out) {
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  BuilderType builder(::arrow::default_memory_pool(), std::make_shared<ArrowType>());
   for (size_t i = 0; i < size; i++) {
     builder.Append("test-string");
   }
@@ -121,16 +125,17 @@ typename std::enable_if<is_arrow_int<ArrowType>::value, Status>::type NullableAr
 
 // This helper function only supports (size/2) nulls yet.
 template <typename ArrowType>
-typename std::enable_if<is_arrow_string<ArrowType>::value, Status>::type NullableArray(
-    size_t size, size_t num_nulls, std::shared_ptr<::arrow::Array>* out) {
+typename std::enable_if<
+    is_arrow_string<ArrowType>::value || is_arrow_binary<ArrowType>::value, Status>::type
+NullableArray(size_t size, size_t num_nulls, std::shared_ptr<::arrow::Array>* out) {
   std::vector<uint8_t> valid_bytes(size, 1);
 
   for (size_t i = 0; i < num_nulls; i++) {
     valid_bytes[i * 2] = 0;
   }
 
-  ::arrow::StringBuilder builder(
-      ::arrow::default_memory_pool(), std::make_shared<::arrow::StringType>());
+  using BuilderType = typename ::arrow::TypeTraits<ArrowType>::BuilderType;
+  BuilderType builder(::arrow::default_memory_pool(), std::make_shared<ArrowType>());
   for (size_t i = 0; i < size; i++) {
     builder.Append("test-string");
   }

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/utils.h
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/utils.h b/src/parquet/arrow/utils.h
index 015421b..9c2abfa 100644
--- a/src/parquet/arrow/utils.h
+++ b/src/parquet/arrow/utils.h
@@ -26,11 +26,11 @@
 namespace parquet {
 namespace arrow {
 
-#define PARQUET_CATCH_NOT_OK(s)                     \
-  try {                                             \
-    (s);                                            \
-  } catch (const ::parquet::ParquetException& e) {  \
-    return ::arrow::Status::IOError(e.what());      \
+#define PARQUET_CATCH_NOT_OK(s)                    \
+  try {                                            \
+    (s);                                           \
+  } catch (const ::parquet::ParquetException& e) { \
+    return ::arrow::Status::IOError(e.what());     \
   }
 
 #define PARQUET_IGNORE_NOT_OK(s) \

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/arrow/writer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc
index e588c84..b7663a3 100644
--- a/src/parquet/arrow/writer.cc
+++ b/src/parquet/arrow/writer.cc
@@ -28,11 +28,11 @@
 
 #include "arrow/api.h"
 
+using arrow::BinaryArray;
 using arrow::MemoryPool;
 using arrow::PoolBuffer;
 using arrow::PrimitiveArray;
 using arrow::Status;
-using arrow::StringArray;
 using arrow::Table;
 
 using parquet::ParquetFileWriter;
@@ -82,7 +82,7 @@ class FileWriter::Impl {
   }
 
   Status WriteFlatColumnChunk(const PrimitiveArray* data, int64_t offset, int64_t length);
-  Status WriteFlatColumnChunk(const StringArray* data, int64_t offset, int64_t length);
+  Status WriteFlatColumnChunk(const BinaryArray* data, int64_t offset, int64_t length);
   Status Close();
 
   virtual ~Impl() {}
@@ -253,7 +253,7 @@ Status FileWriter::Impl::WriteFlatColumnChunk(
 }
 
 Status FileWriter::Impl::WriteFlatColumnChunk(
-    const StringArray* data, int64_t offset, int64_t length) {
+    const BinaryArray* data, int64_t offset, int64_t length) {
   ColumnWriter* column_writer;
   PARQUET_CATCH_NOT_OK(column_writer = row_group_writer_->NextColumn());
   DCHECK((offset + length) <= data->length());
@@ -312,10 +312,11 @@ Status FileWriter::WriteFlatColumnChunk(
     const ::arrow::Array* array, int64_t offset, int64_t length) {
   int64_t real_length = length;
   if (length == -1) { real_length = array->length(); }
-  if (array->type_enum() == ::arrow::Type::STRING) {
-    auto string_array = dynamic_cast<const ::arrow::StringArray*>(array);
-    DCHECK(string_array);
-    return impl_->WriteFlatColumnChunk(string_array, offset, real_length);
+  if (array->type_enum() == ::arrow::Type::STRING ||
+      array->type_enum() == ::arrow::Type::BINARY) {
+    auto binary_array = static_cast<const ::arrow::BinaryArray*>(array);
+    DCHECK(binary_array);
+    return impl_->WriteFlatColumnChunk(binary_array, offset, real_length);
   } else {
     auto primitive_array = dynamic_cast<const PrimitiveArray*>(array);
     if (!primitive_array) {

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/column/column-writer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/column/column-writer-test.cc b/src/parquet/column/column-writer-test.cc
index 5a65175..68d79d1 100644
--- a/src/parquet/column/column-writer-test.cc
+++ b/src/parquet/column/column-writer-test.cc
@@ -214,7 +214,8 @@ void TestPrimitiveWriter<FLBAType>::ReadColumnFully(Compression::type compressio
 }
 
 typedef ::testing::Types<Int32Type, Int64Type, Int96Type, FloatType, DoubleType,
-    BooleanType, ByteArrayType, FLBAType> TestTypes;
+    BooleanType, ByteArrayType, FLBAType>
+    TestTypes;
 
 TYPED_TEST_CASE(TestPrimitiveWriter, TestTypes);
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/compression/brotli-codec.cc
----------------------------------------------------------------------
diff --git a/src/parquet/compression/brotli-codec.cc b/src/parquet/compression/brotli-codec.cc
index 24ff230..8118206 100644
--- a/src/parquet/compression/brotli-codec.cc
+++ b/src/parquet/compression/brotli-codec.cc
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include <cstdint>
-#include <cstdlib>
 #include <brotli/decode.h>
 #include <brotli/encode.h>
+#include <cstdint>
+#include <cstdlib>
 
 #include "parquet/compression/codec.h"
 #include "parquet/exception.h"

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/compression/codec.h
----------------------------------------------------------------------
diff --git a/src/parquet/compression/codec.h b/src/parquet/compression/codec.h
index e803a8c..abd4899 100644
--- a/src/parquet/compression/codec.h
+++ b/src/parquet/compression/codec.h
@@ -65,8 +65,8 @@ class BrotliCodec : public Codec {
   void Decompress(int64_t input_len, const uint8_t* input, int64_t output_len,
       uint8_t* output_buffer) override;
 
-  int64_t Compress(int64_t input_len, const uint8_t* input,
-      int64_t output_buffer_len, uint8_t* output_buffer) override;
+  int64_t Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len,
+      uint8_t* output_buffer) override;
 
   int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override;
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/schema/printer.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/printer.cc b/src/parquet/schema/printer.cc
index c4ab3e7..ca11244 100644
--- a/src/parquet/schema/printer.cc
+++ b/src/parquet/schema/printer.cc
@@ -96,9 +96,8 @@ static void PrintType(const PrimitiveNode* node, std::ostream& stream) {
 static void PrintLogicalType(const PrimitiveNode* node, std::ostream& stream) {
   auto lt = node->logical_type();
   if (lt == LogicalType::DECIMAL) {
-    stream << " (" <<  LogicalTypeToString(lt) << "(" <<
-      node->decimal_metadata().precision << "," <<
-      node->decimal_metadata().scale << "))";
+    stream << " (" << LogicalTypeToString(lt) << "(" << node->decimal_metadata().precision
+           << "," << node->decimal_metadata().scale << "))";
   } else if (lt != LogicalType::NONE) {
     stream << " (" << LogicalTypeToString(lt) << ")";
   }
@@ -120,10 +119,8 @@ void SchemaPrinter::Visit(const GroupNode* node) {
     PrintRepLevel(node->repetition(), stream_);
     stream_ << " group " << node->name();
     auto lt = node->logical_type();
-    if (lt != LogicalType::NONE) {
-      stream_ << " (" << LogicalTypeToString(lt) << ")";
-    }
-    stream_  << " {" << std::endl;
+    if (lt != LogicalType::NONE) { stream_ << " (" << LogicalTypeToString(lt) << ")"; }
+    stream_ << " {" << std::endl;
   }
 
   indent_ += indent_width_;

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/src/parquet/schema/schema-printer-test.cc
----------------------------------------------------------------------
diff --git a/src/parquet/schema/schema-printer-test.cc b/src/parquet/schema/schema-printer-test.cc
index e594f6f..29140f0 100644
--- a/src/parquet/schema/schema-printer-test.cc
+++ b/src/parquet/schema/schema-printer-test.cc
@@ -51,8 +51,8 @@ TEST(TestSchemaPrinter, Examples) {
   NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list}));
   fields.push_back(bag);
 
-  fields.push_back(PrimitiveNode::Make("c", Repetition::REQUIRED, Type::INT32,
-                                       LogicalType::DECIMAL, -1, 3, 2));
+  fields.push_back(PrimitiveNode::Make(
+      "c", Repetition::REQUIRED, Type::INT32, LogicalType::DECIMAL, -1, 3, 2));
 
   NodePtr schema = GroupNode::Make("schema", Repetition::REPEATED, fields);
 

http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/d7625e9a/thirdparty/versions.sh
----------------------------------------------------------------------
diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh
index c968c7b..4233669 100755
--- a/thirdparty/versions.sh
+++ b/thirdparty/versions.sh
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-ARROW_VERSION="2c10d7ccec3c07fb061e1988be16aecaf9916af4"
+ARROW_VERSION="268ffbeffb1cd0617e52d381d500a2d10f61124c"
 ARROW_URL="https://github.com/apache/arrow/archive/${ARROW_VERSION}.tar.gz"
 ARROW_BASEDIR="arrow-${ARROW_VERSION}"