You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/01/06 16:11:51 UTC

arrow git commit: ARROW-427: [C++] Implement dictionary array type

Repository: arrow
Updated Branches:
  refs/heads/master 5bf6ae49e -> 74685f386


ARROW-427: [C++] Implement dictionary array type

After giving this some thought, I concluded that it makes sense to store the reference to the dictionary values themselves in the data type object, similar to `CategoricalDtype` in pandas. This will be at least adequate for the Feather file format merge.

In the IPC metadata, there is no explicit dictionary type -- an array can be dictionary encoded or not. On JIRA we've discussed adding a dictionary type flag indicating whether or not the dictionary values/categories are ordered (also called "ordinal") or unordered (also called "nominal"). That hasn't been done yet.

Author: Wes McKinney <we...@twosigma.com>

Closes #268 from wesm/ARROW-427 and squashes the following commits:

5ce3701 [Wes McKinney] cpplint
a6c2896 [Wes McKinney] Revert T::Equals(const T& other) to EqualsExact to appease clang
9a4edb5 [Wes McKinney] Implement rudimentary DictionaryArray::Validate
9efe46b [Wes McKinney] Add tests, implementation for DictionaryArray::Equals and RangeEquals
b06eb86 [Wes McKinney] Implement PrettyPrint for DictionaryArray
17c70de [Wes McKinney] Refactor, compose shared_ptr<DataType> in DictionaryType
b52b3a7 [Wes McKinney] Add rudimentary DictionaryType and DictionaryArray implementation for discussion


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/74685f38
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/74685f38
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/74685f38

Branch: refs/heads/master
Commit: 74685f386307171a90a9f97316e25b7f39cdd0a1
Parents: 5bf6ae4
Author: Wes McKinney <we...@twosigma.com>
Authored: Fri Jan 6 11:11:43 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Fri Jan 6 11:11:43 2017 -0500

----------------------------------------------------------------------
 cpp/src/arrow/CMakeLists.txt           |   1 +
 cpp/src/arrow/array-dictionary-test.cc | 128 ++++++++++++++++++++++
 cpp/src/arrow/array-string-test.cc     |   4 +-
 cpp/src/arrow/array.cc                 |  94 +++++++++++++---
 cpp/src/arrow/array.h                  | 111 ++++++++++++++++---
 cpp/src/arrow/ipc/adapter.cc           |  11 ++
 cpp/src/arrow/ipc/json-internal.cc     |  13 +++
 cpp/src/arrow/pretty_print-test.cc     |  53 +++++----
 cpp/src/arrow/pretty_print.cc          |  12 ++
 cpp/src/arrow/test-util.h              |  36 +++---
 cpp/src/arrow/type.cc                  |  69 ++++++++++--
 cpp/src/arrow/type.h                   | 163 +++++++++++++++++++++-------
 cpp/src/arrow/type_fwd.h               |  57 +---------
 format/Message.fbs                     |   2 +-
 python/pyarrow/includes/libarrow.pxd   |   3 +-
 python/pyarrow/includes/parquet.pxd    |   2 +-
 python/pyarrow/parquet.pyx             |   4 +-
 python/pyarrow/schema.pyx              |   4 +-
 18 files changed, 583 insertions(+), 184 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 16668db..e5e36ed 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -47,6 +47,7 @@ install(
 
 ADD_ARROW_TEST(array-test)
 ADD_ARROW_TEST(array-decimal-test)
+ADD_ARROW_TEST(array-dictionary-test)
 ADD_ARROW_TEST(array-list-test)
 ADD_ARROW_TEST(array-primitive-test)
 ADD_ARROW_TEST(array-string-test)

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/array-dictionary-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-dictionary-test.cc b/cpp/src/arrow/array-dictionary-test.cc
new file mode 100644
index 0000000..c290153
--- /dev/null
+++ b/cpp/src/arrow/array-dictionary-test.cc
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstdint>
+#include <cstdlib>
+#include <memory>
+#include <numeric>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/memory_pool.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+TEST(TestDictionary, Basics) {
+  std::vector<int32_t> values = {100, 1000, 10000, 100000};
+  std::shared_ptr<Array> dict;
+  ArrayFromVector<Int32Type, int32_t>(int32(), values, &dict);
+
+  std::shared_ptr<DictionaryType> type1 =
+      std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict));
+  DictionaryType type2(int16(), dict);
+
+  ASSERT_TRUE(int16()->Equals(type1->index_type()));
+  ASSERT_TRUE(type1->dictionary()->Equals(dict));
+
+  ASSERT_TRUE(int16()->Equals(type2.index_type()));
+  ASSERT_TRUE(type2.dictionary()->Equals(dict));
+
+  ASSERT_EQ("dictionary<int32, int16>", type1->ToString());
+}
+
+TEST(TestDictionary, Equals) {
+  std::vector<bool> is_valid = {true, true, false, true, true, true};
+
+  std::shared_ptr<Array> dict;
+  std::vector<std::string> dict_values = {"foo", "bar", "baz"};
+  ArrayFromVector<StringType, std::string>(utf8(), dict_values, &dict);
+  std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
+
+  std::shared_ptr<Array> dict2;
+  std::vector<std::string> dict2_values = {"foo", "bar", "baz", "qux"};
+  ArrayFromVector<StringType, std::string>(utf8(), dict2_values, &dict2);
+  std::shared_ptr<DataType> dict2_type = dictionary(int16(), dict2);
+
+  std::shared_ptr<Array> indices;
+  std::vector<int16_t> indices_values = {1, 2, -1, 0, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices_values, &indices);
+
+  std::shared_ptr<Array> indices2;
+  std::vector<int16_t> indices2_values = {1, 2, 0, 0, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices2_values, &indices2);
+
+  std::shared_ptr<Array> indices3;
+  std::vector<int16_t> indices3_values = {1, 1, 0, 0, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices3_values, &indices3);
+
+  auto arr = std::make_shared<DictionaryArray>(dict_type, indices);
+  auto arr2 = std::make_shared<DictionaryArray>(dict_type, indices2);
+  auto arr3 = std::make_shared<DictionaryArray>(dict2_type, indices);
+  auto arr4 = std::make_shared<DictionaryArray>(dict_type, indices3);
+
+  ASSERT_TRUE(arr->Equals(arr));
+
+  // Equal, because the unequal index is masked by null
+  ASSERT_TRUE(arr->Equals(arr2));
+
+  // Unequal dictionaries
+  ASSERT_FALSE(arr->Equals(arr3));
+
+  // Unequal indices
+  ASSERT_FALSE(arr->Equals(arr4));
+
+  // RangeEquals
+  ASSERT_TRUE(arr->RangeEquals(3, 6, 3, arr4));
+  ASSERT_FALSE(arr->RangeEquals(1, 3, 1, arr4));
+}
+
+TEST(TestDictionary, Validate) {
+  std::vector<bool> is_valid = {true, true, false, true, true, true};
+
+  std::shared_ptr<Array> dict;
+  std::vector<std::string> dict_values = {"foo", "bar", "baz"};
+  ArrayFromVector<StringType, std::string>(utf8(), dict_values, &dict);
+  std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
+
+  std::shared_ptr<Array> indices;
+  std::vector<uint8_t> indices_values = {1, 2, 0, 0, 2, 0};
+  ArrayFromVector<UInt8Type, uint8_t>(uint8(), is_valid, indices_values, &indices);
+
+  std::shared_ptr<Array> indices2;
+  std::vector<float> indices2_values = {1., 2., 0., 0., 2., 0.};
+  ArrayFromVector<FloatType, float>(float32(), is_valid, indices2_values, &indices2);
+
+  std::shared_ptr<Array> indices3;
+  std::vector<int64_t> indices3_values = {1, 2, 0, 0, 2, 0};
+  ArrayFromVector<Int64Type, int64_t>(int64(), is_valid, indices3_values, &indices3);
+
+  std::shared_ptr<Array> arr = std::make_shared<DictionaryArray>(dict_type, indices);
+  std::shared_ptr<Array> arr2 = std::make_shared<DictionaryArray>(dict_type, indices2);
+  std::shared_ptr<Array> arr3 = std::make_shared<DictionaryArray>(dict_type, indices3);
+
+  // Only checking index type for now
+  ASSERT_OK(arr->Validate());
+  ASSERT_RAISES(Invalid, arr2->Validate());
+  ASSERT_OK(arr3->Validate());
+}
+
+}  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/array-string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc
index b144c63..024bfd5 100644
--- a/cpp/src/arrow/array-string-test.cc
+++ b/cpp/src/arrow/array-string-test.cc
@@ -36,8 +36,8 @@ TEST(TypesTest, BinaryType) {
   BinaryType t1;
   BinaryType e1;
   StringType t2;
-  EXPECT_TRUE(t1.Equals(&e1));
-  EXPECT_FALSE(t1.Equals(&t2));
+  EXPECT_TRUE(t1.Equals(e1));
+  EXPECT_FALSE(t1.Equals(t2));
   ASSERT_EQ(t1.type, Type::BINARY);
   ASSERT_EQ(t1.ToString(), std::string("binary"));
 }

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 3d309b8..7509520 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -42,7 +42,7 @@ Status GetEmptyBitmap(
 // ----------------------------------------------------------------------
 // Base array class
 
-Array::Array(const TypePtr& type, int32_t length, int32_t null_count,
+Array::Array(const std::shared_ptr<DataType>& type, int32_t length, int32_t null_count,
     const std::shared_ptr<Buffer>& null_bitmap) {
   type_ = type;
   length_ = length;
@@ -51,6 +51,12 @@ Array::Array(const TypePtr& type, int32_t length, int32_t null_count,
   if (null_bitmap_) { null_bitmap_data_ = null_bitmap_->data(); }
 }
 
+bool Array::BaseEquals(const std::shared_ptr<Array>& other) const {
+  if (this == other.get()) { return true; }
+  if (!other) { return false; }
+  return EqualsExact(*other.get());
+}
+
 bool Array::EqualsExact(const Array& other) const {
   if (this == &other) { return true; }
   if (length_ != other.length_ || null_count_ != other.null_count_ ||
@@ -91,7 +97,7 @@ Status NullArray::Accept(ArrayVisitor* visitor) const {
 // ----------------------------------------------------------------------
 // Primitive array base
 
-PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length,
+PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
     const std::shared_ptr<Buffer>& data, int32_t null_count,
     const std::shared_ptr<Buffer>& null_bitmap)
     : Array(type, length, null_count, null_bitmap) {
@@ -100,14 +106,9 @@ PrimitiveArray::PrimitiveArray(const TypePtr& type, int32_t length,
 }
 
 bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const {
-  if (this == &other) { return true; }
-  if (null_count_ != other.null_count_) { return false; }
+  if (!Array::EqualsExact(other)) { return false; }
 
   if (null_count_ > 0) {
-    bool equal_bitmap =
-        null_bitmap_->Equals(*other.null_bitmap_, BitUtil::CeilByte(length_) / 8);
-    if (!equal_bitmap) { return false; }
-
     const uint8_t* this_data = raw_data_;
     const uint8_t* other_data = other.raw_data_;
 
@@ -131,7 +132,7 @@ bool PrimitiveArray::Equals(const std::shared_ptr<Array>& arr) const {
   if (this == arr.get()) { return true; }
   if (!arr) { return false; }
   if (this->type_enum() != arr->type_enum()) { return false; }
-  return EqualsExact(*static_cast<const PrimitiveArray*>(arr.get()));
+  return EqualsExact(static_cast<const PrimitiveArray&>(*arr.get()));
 }
 
 template <typename T>
@@ -161,7 +162,7 @@ BooleanArray::BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data,
     : PrimitiveArray(
           std::make_shared<BooleanType>(), length, data, null_count, null_bitmap) {}
 
-BooleanArray::BooleanArray(const TypePtr& type, int32_t length,
+BooleanArray::BooleanArray(const std::shared_ptr<DataType>& type, int32_t length,
     const std::shared_ptr<Buffer>& data, int32_t null_count,
     const std::shared_ptr<Buffer>& null_bitmap)
     : PrimitiveArray(type, length, data, null_count, null_bitmap) {}
@@ -192,7 +193,7 @@ bool BooleanArray::EqualsExact(const BooleanArray& other) const {
 bool BooleanArray::Equals(const std::shared_ptr<Array>& arr) const {
   if (this == arr.get()) return true;
   if (Type::BOOL != arr->type_enum()) { return false; }
-  return EqualsExact(*static_cast<const BooleanArray*>(arr.get()));
+  return EqualsExact(static_cast<const BooleanArray&>(*arr.get()));
 }
 
 bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx,
@@ -238,7 +239,7 @@ bool ListArray::EqualsExact(const ListArray& other) const {
 bool ListArray::Equals(const std::shared_ptr<Array>& arr) const {
   if (this == arr.get()) { return true; }
   if (this->type_enum() != arr->type_enum()) { return false; }
-  return EqualsExact(*static_cast<const ListArray*>(arr.get()));
+  return EqualsExact(static_cast<const ListArray&>(*arr.get()));
 }
 
 bool ListArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
@@ -333,7 +334,7 @@ BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
     const std::shared_ptr<Buffer>& null_bitmap)
     : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {}
 
-BinaryArray::BinaryArray(const TypePtr& type, int32_t length,
+BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int32_t length,
     const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data,
     int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
     : Array(type, length, null_count, null_bitmap),
@@ -364,7 +365,7 @@ bool BinaryArray::EqualsExact(const BinaryArray& other) const {
 bool BinaryArray::Equals(const std::shared_ptr<Array>& arr) const {
   if (this == arr.get()) { return true; }
   if (this->type_enum() != arr->type_enum()) { return false; }
-  return EqualsExact(*static_cast<const BinaryArray*>(arr.get()));
+  return EqualsExact(static_cast<const BinaryArray&>(*arr.get()));
 }
 
 bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
@@ -493,7 +494,7 @@ Status StructArray::Accept(ArrayVisitor* visitor) const {
 // ----------------------------------------------------------------------
 // UnionArray
 
-UnionArray::UnionArray(const TypePtr& type, int32_t length,
+UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int32_t length,
     const std::vector<std::shared_ptr<Array>>& children,
     const std::shared_ptr<Buffer>& type_ids, const std::shared_ptr<Buffer>& offsets,
     int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
@@ -587,13 +588,73 @@ Status UnionArray::Accept(ArrayVisitor* visitor) const {
 }
 
 // ----------------------------------------------------------------------
+// DictionaryArray
+
+Status DictionaryArray::FromBuffer(const std::shared_ptr<DataType>& type, int32_t length,
+    const std::shared_ptr<Buffer>& indices, int32_t null_count,
+    const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<DictionaryArray>* out) {
+  DCHECK_EQ(type->type, Type::DICTIONARY);
+  const auto& dict_type = static_cast<const DictionaryType*>(type.get());
+
+  std::shared_ptr<Array> boxed_indices;
+  RETURN_NOT_OK(MakePrimitiveArray(
+      dict_type->index_type(), length, indices, null_count, null_bitmap, &boxed_indices));
+
+  *out = std::make_shared<DictionaryArray>(type, boxed_indices);
+  return Status::OK();
+}
+
+DictionaryArray::DictionaryArray(
+    const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices)
+    : Array(type, indices->length(), indices->null_count(), indices->null_bitmap()),
+      dict_type_(static_cast<const DictionaryType*>(type.get())),
+      indices_(indices) {
+  DCHECK_EQ(type->type, Type::DICTIONARY);
+}
+
+Status DictionaryArray::Validate() const {
+  Type::type index_type_id = indices_->type()->type;
+  if (!is_integer(index_type_id)) {
+    return Status::Invalid("Dictionary indices must be integer type");
+  }
+  return Status::OK();
+}
+
+std::shared_ptr<Array> DictionaryArray::dictionary() const {
+  return dict_type_->dictionary();
+}
+
+bool DictionaryArray::EqualsExact(const DictionaryArray& other) const {
+  if (!dictionary()->Equals(other.dictionary())) { return false; }
+  return indices_->Equals(other.indices());
+}
+
+bool DictionaryArray::Equals(const std::shared_ptr<Array>& arr) const {
+  if (this == arr.get()) { return true; }
+  if (Type::DICTIONARY != arr->type_enum()) { return false; }
+  return EqualsExact(static_cast<const DictionaryArray&>(*arr.get()));
+}
+
+bool DictionaryArray::RangeEquals(int32_t start_idx, int32_t end_idx,
+    int32_t other_start_idx, const std::shared_ptr<Array>& arr) const {
+  if (Type::DICTIONARY != arr->type_enum()) { return false; }
+  const auto& dict_other = static_cast<const DictionaryArray&>(*arr.get());
+  if (!dictionary()->Equals(dict_other.dictionary())) { return false; }
+  return indices_->RangeEquals(start_idx, end_idx, other_start_idx, dict_other.indices());
+}
+
+Status DictionaryArray::Accept(ArrayVisitor* visitor) const {
+  return visitor->Visit(*this);
+}
+
+// ----------------------------------------------------------------------
 
 #define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType)                          \
   case Type::ENUM:                                                          \
     out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \
     break;
 
-Status MakePrimitiveArray(const TypePtr& type, int32_t length,
+Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
     const std::shared_ptr<Buffer>& data, int32_t null_count,
     const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out) {
   switch (type->type) {
@@ -610,7 +671,6 @@ Status MakePrimitiveArray(const TypePtr& type, int32_t length,
     MAKE_PRIMITIVE_ARRAY_CASE(DOUBLE, DoubleArray);
     MAKE_PRIMITIVE_ARRAY_CASE(TIME, Int64Array);
     MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP, TimestampArray);
-    MAKE_PRIMITIVE_ARRAY_CASE(TIMESTAMP_DOUBLE, DoubleArray);
     default:
       return Status::NotImplemented(type->ToString());
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index cd42a28..57214c4 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -26,6 +26,7 @@
 
 #include "arrow/buffer.h"
 #include "arrow/type.h"
+#include "arrow/type_fwd.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
@@ -36,6 +37,34 @@ class MemoryPool;
 class MutableBuffer;
 class Status;
 
+class ArrayVisitor {
+ public:
+  virtual Status Visit(const NullArray& array) = 0;
+  virtual Status Visit(const BooleanArray& array) = 0;
+  virtual Status Visit(const Int8Array& array) = 0;
+  virtual Status Visit(const Int16Array& array) = 0;
+  virtual Status Visit(const Int32Array& array) = 0;
+  virtual Status Visit(const Int64Array& array) = 0;
+  virtual Status Visit(const UInt8Array& array) = 0;
+  virtual Status Visit(const UInt16Array& array) = 0;
+  virtual Status Visit(const UInt32Array& array) = 0;
+  virtual Status Visit(const UInt64Array& array) = 0;
+  virtual Status Visit(const HalfFloatArray& array) = 0;
+  virtual Status Visit(const FloatArray& array) = 0;
+  virtual Status Visit(const DoubleArray& array) = 0;
+  virtual Status Visit(const StringArray& array) = 0;
+  virtual Status Visit(const BinaryArray& array) = 0;
+  virtual Status Visit(const DateArray& array) = 0;
+  virtual Status Visit(const TimeArray& array) = 0;
+  virtual Status Visit(const TimestampArray& array) = 0;
+  virtual Status Visit(const IntervalArray& array) = 0;
+  virtual Status Visit(const DecimalArray& array) = 0;
+  virtual Status Visit(const ListArray& array) = 0;
+  virtual Status Visit(const StructArray& array) = 0;
+  virtual Status Visit(const UnionArray& array) = 0;
+  virtual Status Visit(const DictionaryArray& type) = 0;
+};
+
 // Immutable data array with some logical type and some length. Any memory is
 // owned by the respective Buffer instance (or its parents).
 //
@@ -63,6 +92,7 @@ class ARROW_EXPORT Array {
 
   const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
 
+  bool BaseEquals(const std::shared_ptr<Array>& arr) const;
   bool EqualsExact(const Array& arr) const;
   virtual bool Equals(const std::shared_ptr<Array>& arr) const = 0;
   virtual bool ApproxEquals(const std::shared_ptr<Array>& arr) const;
@@ -122,8 +152,9 @@ class ARROW_EXPORT PrimitiveArray : public Array {
   bool Equals(const std::shared_ptr<Array>& arr) const override;
 
  protected:
-  PrimitiveArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& data,
-      int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+  PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
+      const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
   std::shared_ptr<Buffer> data_;
   const uint8_t* raw_data_;
 };
@@ -137,8 +168,9 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
       int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
       : PrimitiveArray(
             std::make_shared<TypeClass>(), length, data, null_count, null_bitmap) {}
-  NumericArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& data,
-      int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
+  NumericArray(const std::shared_ptr<DataType>& type, int32_t length,
+      const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr)
       : PrimitiveArray(type, length, data, null_count, null_bitmap) {}
 
   bool EqualsExact(const NumericArray<TypeClass>& other) const {
@@ -146,7 +178,7 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
   }
 
   bool ApproxEquals(const std::shared_ptr<Array>& arr) const override {
-    return Equals(arr);
+    return PrimitiveArray::Equals(arr);
   }
 
   bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
@@ -250,8 +282,9 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
 
   BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data,
       int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
-  BooleanArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& data,
-      int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+  BooleanArray(const std::shared_ptr<DataType>& type, int32_t length,
+      const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
+      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
   bool EqualsExact(const BooleanArray& other) const;
   bool Equals(const std::shared_ptr<Array>& arr) const override;
@@ -272,9 +305,9 @@ class ARROW_EXPORT ListArray : public Array {
  public:
   using TypeClass = ListType;
 
-  ListArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& offsets,
-      const std::shared_ptr<Array>& values, int32_t null_count = 0,
-      const std::shared_ptr<Buffer>& null_bitmap = nullptr)
+  ListArray(const std::shared_ptr<DataType>& type, int32_t length,
+      const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Array>& values,
+      int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
       : Array(type, length, null_count, null_bitmap) {
     offsets_buffer_ = offsets;
     offsets_ = offsets == nullptr ? nullptr : reinterpret_cast<const int32_t*>(
@@ -328,9 +361,9 @@ class ARROW_EXPORT BinaryArray : public Array {
 
   // Constructor that allows sub-classes/builders to propagate there logical type up the
   // class hierarchy.
-  BinaryArray(const TypePtr& type, int32_t length, const std::shared_ptr<Buffer>& offsets,
-      const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
-      const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+  BinaryArray(const std::shared_ptr<DataType>& type, int32_t length,
+      const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data,
+      int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
 
   // Return the pointer to the given elements bytes
   // TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy
@@ -397,7 +430,7 @@ class ARROW_EXPORT StructArray : public Array {
  public:
   using TypeClass = StructType;
 
-  StructArray(const TypePtr& type, int32_t length,
+  StructArray(const std::shared_ptr<DataType>& type, int32_t length,
       const std::vector<std::shared_ptr<Array>>& field_arrays, int32_t null_count = 0,
       std::shared_ptr<Buffer> null_bitmap = nullptr)
       : Array(type, length, null_count, null_bitmap) {
@@ -434,7 +467,7 @@ class ARROW_EXPORT UnionArray : public Array {
  public:
   using TypeClass = UnionType;
 
-  UnionArray(const TypePtr& type, int32_t length,
+  UnionArray(const std::shared_ptr<DataType>& type, int32_t length,
       const std::vector<std::shared_ptr<Array>>& children,
       const std::shared_ptr<Buffer>& type_ids,
       const std::shared_ptr<Buffer>& offsets = nullptr, int32_t null_count = 0,
@@ -474,6 +507,54 @@ class ARROW_EXPORT UnionArray : public Array {
 };
 
 // ----------------------------------------------------------------------
+// DictionaryArray (categorical and dictionary-encoded in memory)
+
+// A dictionary array contains an array of non-negative integers (the
+// "dictionary indices") along with a data type containing a "dictionary"
+// corresponding to the distinct values represented in the data.
+//
+// For example, the array
+//
+//   ["foo", "bar", "foo", "bar", "foo", "bar"]
+//
+// with dictionary ["bar", "foo"], would have dictionary array representation
+//
+//   indices: [1, 0, 1, 0, 1, 0]
+//   dictionary: ["bar", "foo"]
+//
+// The indices in principle may have any integer type (signed or unsigned),
+// though presently data in IPC exchanges must be signed int32.
+class ARROW_EXPORT DictionaryArray : public Array {
+ public:
+  using TypeClass = DictionaryType;
+
+  DictionaryArray(
+      const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices);
+
+  // Alternate ctor taking a raw indices buffer; other attributes (like null
+  // count) are inherited from the indices array built from that buffer
+  static Status FromBuffer(const std::shared_ptr<DataType>& type, int32_t length,
+      const std::shared_ptr<Buffer>& indices, int32_t null_count,
+      const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<DictionaryArray>* out);
+
+  Status Validate() const override;
+
+  std::shared_ptr<Array> indices() const { return indices_; }
+  std::shared_ptr<Array> dictionary() const;
+
+  bool EqualsExact(const DictionaryArray& other) const;
+  bool Equals(const std::shared_ptr<Array>& arr) const override;
+  bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx,
+      const std::shared_ptr<Array>& arr) const override;
+
+  Status Accept(ArrayVisitor* visitor) const override;
+
+ protected:
+  const DictionaryType* dict_type_;
+  std::shared_ptr<Array> indices_;
+};
+
+// ----------------------------------------------------------------------
 // extern templates and other details
 
 // gcc and clang disagree about how to handle template visibility when you have

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/ipc/adapter.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc
index 9bfd11f..2b5ef11 100644
--- a/cpp/src/arrow/ipc/adapter.cc
+++ b/cpp/src/arrow/ipc/adapter.cc
@@ -288,6 +288,13 @@ class RecordBatchWriter : public ArrayVisitor {
     return Status::OK();
   }
 
+  Status Visit(const DictionaryArray& array) override {
+    // Dictionary written out separately
+    const auto& indices = static_cast<const PrimitiveArray&>(*array.indices().get());
+    buffers_.push_back(indices.data());
+    return Status::OK();
+  }
+
   // Do not copy this vector. Ownership must be retained elsewhere
   const std::vector<std::shared_ptr<Array>>& columns_;
   int32_t num_rows_;
@@ -539,6 +546,10 @@ class ArrayLoader : public TypeVisitor {
         type_ids, offsets, field_meta.null_count, null_bitmap);
     return Status::OK();
   }
+
+  Status Visit(const DictionaryType& type) override {
+    return Status::NotImplemented("dictionary");
+  };
 };
 
 class RecordBatchReader {

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/ipc/json-internal.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 4f980d3..43bd8a4 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -334,6 +334,14 @@ class JsonSchemaWriter : public TypeVisitor {
     return Status::OK();
   }
 
+  Status Visit(const DictionaryType& type) override {
+    // WriteName("dictionary", type);
+    // WriteChildren(type.children());
+    // WriteBufferLayout(type.GetBufferLayout());
+    // return Status::OK();
+    return Status::NotImplemented("dictionary type");
+  }
+
  private:
   const Schema& schema_;
   RjWriter* writer_;
@@ -546,6 +554,10 @@ class JsonArrayWriter : public ArrayVisitor {
     return WriteChildren(type->children(), array.children());
   }
 
+  Status Visit(const DictionaryArray& array) override {
+    return Status::NotImplemented("dictionary");
+  }
+
  private:
   const std::string& name_;
   const Array& array_;
@@ -1043,6 +1055,7 @@ class JsonArrayReader {
       TYPE_CASE(ListType);
       TYPE_CASE(StructType);
       TYPE_CASE(UnionType);
+      NOT_IMPLEMENTED_CASE(DICTIONARY);
       default:
         std::stringstream ss;
         ss << type->ToString();

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/pretty_print-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc
index c22d3aa..4725d5d 100644
--- a/cpp/src/arrow/pretty_print-test.cc
+++ b/cpp/src/arrow/pretty_print-test.cc
@@ -34,7 +34,7 @@
 
 namespace arrow {
 
-class TestArrayPrinter : public ::testing::Test {
+class TestPrettyPrint : public ::testing::Test {
  public:
   void SetUp() {}
 
@@ -44,32 +44,22 @@ class TestArrayPrinter : public ::testing::Test {
   std::ostringstream sink_;
 };
 
+void CheckArray(const Array& arr, int indent, const char* expected) {
+  std::ostringstream sink;
+  ASSERT_OK(PrettyPrint(arr, indent, &sink));
+  std::string result = sink.str();
+  ASSERT_EQ(std::string(expected, strlen(expected)), result);
+}
+
 template <typename TYPE, typename C_TYPE>
 void CheckPrimitive(int indent, const std::vector<bool>& is_valid,
     const std::vector<C_TYPE>& values, const char* expected) {
-  std::ostringstream sink;
-
-  MemoryPool* pool = default_memory_pool();
-  typename TypeTraits<TYPE>::BuilderType builder(pool, std::make_shared<TYPE>());
-
-  for (size_t i = 0; i < values.size(); ++i) {
-    if (is_valid[i]) {
-      ASSERT_OK(builder.Append(values[i]));
-    } else {
-      ASSERT_OK(builder.AppendNull());
-    }
-  }
-
   std::shared_ptr<Array> array;
-  ASSERT_OK(builder.Finish(&array));
-
-  ASSERT_OK(PrettyPrint(*array.get(), indent, &sink));
-
-  std::string result = sink.str();
-  ASSERT_EQ(std::string(expected, strlen(expected)), result);
+  ArrayFromVector<TYPE, C_TYPE>(std::make_shared<TYPE>(), is_valid, values, &array);
+  CheckArray(*array.get(), indent, expected);
 }
 
-TEST_F(TestArrayPrinter, PrimitiveType) {
+TEST_F(TestPrettyPrint, PrimitiveType) {
   std::vector<bool> is_valid = {true, true, false, true, false};
 
   std::vector<int32_t> values = {0, 1, 2, 3, 4};
@@ -81,4 +71,25 @@ TEST_F(TestArrayPrinter, PrimitiveType) {
   CheckPrimitive<StringType, std::string>(0, is_valid, values2, ex2);
 }
 
+TEST_F(TestPrettyPrint, DictionaryType) {
+  std::vector<bool> is_valid = {true, true, false, true, true, true};
+
+  std::shared_ptr<Array> dict;
+  std::vector<std::string> dict_values = {"foo", "bar", "baz"};
+  ArrayFromVector<StringType, std::string>(utf8(), dict_values, &dict);
+  std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
+
+  std::shared_ptr<Array> indices;
+  std::vector<int16_t> indices_values = {1, 2, -1, 0, 2, 0};
+  ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices_values, &indices);
+  auto arr = std::make_shared<DictionaryArray>(dict_type, indices);
+
+  static const char* expected = R"expected(
+-- is_valid: [true, true, false, true, true, true]
+-- dictionary: ["foo", "bar", "baz"]
+-- indices: [1, 2, null, 0, 2, 0])expected";
+
+  CheckArray(*arr.get(), 0, expected);
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/pretty_print.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index 324f81b..e30f4cc 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -217,6 +217,18 @@ class ArrayPrinter : public ArrayVisitor {
     return PrintChildren(array.children());
   }
 
+  Status Visit(const DictionaryArray& array) override {
+    RETURN_NOT_OK(WriteValidityBitmap(array));
+
+    Newline();
+    Write("-- dictionary: ");
+    RETURN_NOT_OK(PrettyPrint(*array.dictionary().get(), indent_ + 2, sink_));
+
+    Newline();
+    Write("-- indices: ");
+    return PrettyPrint(*array.indices().get(), indent_ + 2, sink_);
+  }
+
   void Write(const char* data) { (*sink_) << data; }
 
   void Write(const std::string& data) { (*sink_) << data; }

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/test-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h
index 70e9333..e595749 100644
--- a/cpp/src/arrow/test-util.h
+++ b/cpp/src/arrow/test-util.h
@@ -257,33 +257,27 @@ template <typename TYPE, typename C_TYPE>
 void ArrayFromVector(const std::shared_ptr<DataType>& type,
     const std::vector<bool>& is_valid, const std::vector<C_TYPE>& values,
     std::shared_ptr<Array>* out) {
-  std::shared_ptr<Buffer> values_buffer;
-  std::shared_ptr<Buffer> values_bitmap;
-
-  ASSERT_OK(test::CopyBufferFromVector(values, &values_buffer));
-  ASSERT_OK(test::GetBitmapFromBoolVector(is_valid, &values_bitmap));
-
-  using ArrayType = typename TypeTraits<TYPE>::ArrayType;
-
-  int32_t null_count = 0;
-  for (bool val : is_valid) {
-    if (!val) { ++null_count; }
+  MemoryPool* pool = default_memory_pool();
+  typename TypeTraits<TYPE>::BuilderType builder(pool, type);  // honor caller's type
+  for (size_t i = 0; i < values.size(); ++i) {
+    if (is_valid[i]) {
+      ASSERT_OK(builder.Append(values[i]));
+    } else {
+      ASSERT_OK(builder.AppendNull());
+    }
   }
-
-  *out = std::make_shared<ArrayType>(type, static_cast<int32_t>(values.size()),
-      values_buffer, null_count, values_bitmap);
+  ASSERT_OK(builder.Finish(out));
 }
 
 template <typename TYPE, typename C_TYPE>
 void ArrayFromVector(const std::shared_ptr<DataType>& type,
     const std::vector<C_TYPE>& values, std::shared_ptr<Array>* out) {
-  std::shared_ptr<Buffer> values_buffer;
-
-  ASSERT_OK(test::CopyBufferFromVector(values, &values_buffer));
-
-  using ArrayType = typename TypeTraits<TYPE>::ArrayType;
-  *out = std::make_shared<ArrayType>(
-      type, static_cast<int32_t>(values.size()), values_buffer);
+  MemoryPool* pool = default_memory_pool();
+  typename TypeTraits<TYPE>::BuilderType builder(pool, type);  // honor caller's type
+  for (size_t i = 0; i < values.size(); ++i) {
+    ASSERT_OK(builder.Append(values[i]));
+  }
+  ASSERT_OK(builder.Finish(out));
 }
 
 class TestBuilder : public ::testing::Test {

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/type.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index 89faab6..954fba7 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -20,10 +20,22 @@
 #include <sstream>
 #include <string>
 
+#include "arrow/array.h"
 #include "arrow/status.h"
+#include "arrow/util/logging.h"
 
 namespace arrow {
 
+bool Field::Equals(const Field& other) const {
+  return (this == &other) ||  // same object, else compare every member to other's
+         (this->name == other.name && this->nullable == other.nullable &&
+             this->dictionary == other.dictionary && this->type->Equals(*other.type));
+}
+
+bool Field::Equals(const std::shared_ptr<Field>& other) const {
+  return Equals(*other.get());
+}
+
 std::string Field::ToString() const {
   std::stringstream ss;
   ss << this->name << ": " << this->type->ToString();
@@ -33,14 +45,14 @@ std::string Field::ToString() const {
 
 DataType::~DataType() {}
 
-bool DataType::Equals(const DataType* other) const {
-  bool equals = other && ((this == other) ||
-                             ((this->type == other->type) &&
-                                 ((this->num_children() == other->num_children()))));
+bool DataType::Equals(const DataType& other) const {
+  bool equals =
+      ((this == &other) || ((this->type == other.type) &&
+                               ((this->num_children() == other.num_children()))));
   if (equals) {
     for (int i = 0; i < num_children(); ++i) {
       // TODO(emkornfield) limit recursion
-      if (!children_[i]->Equals(other->children_[i])) { return false; }
+      if (!children_[i]->Equals(other.children_[i])) { return false; }
     }
   }
   return equals;
@@ -109,11 +121,47 @@ std::string UnionType::ToString() const {
   return s.str();
 }
 
+// ----------------------------------------------------------------------
+// DictionaryType
+
+DictionaryType::DictionaryType(
+    const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& dictionary)
+    : FixedWidthType(Type::DICTIONARY),
+      index_type_(index_type),
+      dictionary_(dictionary) {}
+
+int DictionaryType::bit_width() const {
+  return static_cast<const FixedWidthType*>(index_type_.get())->bit_width();
+}
+
+std::shared_ptr<Array> DictionaryType::dictionary() const {
+  return dictionary_;
+}
+
+bool DictionaryType::Equals(const DataType& other) const {
+  if (other.type != Type::DICTIONARY) { return false; }
+  const auto& other_dict = static_cast<const DictionaryType&>(other);
+
+  return index_type_->Equals(other_dict.index_type_) &&
+         dictionary_->Equals(other_dict.dictionary_);
+}
+
+std::string DictionaryType::ToString() const {
+  std::stringstream ss;
+  ss << "dictionary<" << dictionary_->type()->ToString() << ", "
+     << index_type_->ToString() << ">";
+  return ss.str();
+}
+
+// ----------------------------------------------------------------------
+// Null type
+
 std::string NullType::ToString() const {
   return name();
 }
 
-// Visitors and template instantiation
+// ----------------------------------------------------------------------
+// Visitors and factory functions
 
 #define ACCEPT_VISITOR(TYPE) \
   Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); }
@@ -130,6 +178,7 @@ ACCEPT_VISITOR(DateType);
 ACCEPT_VISITOR(TimeType);
 ACCEPT_VISITOR(TimestampType);
 ACCEPT_VISITOR(IntervalType);
+ACCEPT_VISITOR(DictionaryType);
 
 #define TYPE_FACTORY(NAME, KLASS)                                        \
   std::shared_ptr<DataType> NAME() {                                     \
@@ -174,12 +223,16 @@ std::shared_ptr<DataType> struct_(const std::vector<std::shared_ptr<Field>>& fie
   return std::make_shared<StructType>(fields);
 }
 
-std::shared_ptr<DataType> ARROW_EXPORT union_(
-    const std::vector<std::shared_ptr<Field>>& child_fields,
+std::shared_ptr<DataType> union_(const std::vector<std::shared_ptr<Field>>& child_fields,
     const std::vector<uint8_t>& type_ids, UnionMode mode) {
   return std::make_shared<UnionType>(child_fields, type_ids, mode);
 }
 
+std::shared_ptr<DataType> dictionary(const std::shared_ptr<DataType>& index_type,
+    const std::shared_ptr<Array>& dict_values) {
+  return std::make_shared<DictionaryType>(index_type, dict_values);
+}
+
 std::shared_ptr<Field> field(
     const std::string& name, const TypePtr& type, bool nullable, int64_t dictionary) {
   return std::make_shared<Field>(name, type, nullable, dictionary);

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 530c323..c2a762d 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -37,67 +37,64 @@ namespace arrow {
 struct Type {
   enum type {
     // A degenerate NULL type represented as 0 bytes/bits
-    NA = 0,
+    NA,
 
     // A boolean value represented as 1 bit
-    BOOL = 1,
+    BOOL,
 
     // Little-endian integer types
-    UINT8 = 2,
-    INT8 = 3,
-    UINT16 = 4,
-    INT16 = 5,
-    UINT32 = 6,
-    INT32 = 7,
-    UINT64 = 8,
-    INT64 = 9,
+    UINT8,
+    INT8,
+    UINT16,
+    INT16,
+    UINT32,
+    INT32,
+    UINT64,
+    INT64,
 
     // 2-byte floating point value
-    HALF_FLOAT = 10,
+    HALF_FLOAT,
 
     // 4-byte floating point value
-    FLOAT = 11,
+    FLOAT,
 
     // 8-byte floating point value
-    DOUBLE = 12,
+    DOUBLE,
 
     // UTF8 variable-length string as List<Char>
-    STRING = 13,
+    STRING,
 
     // Variable-length bytes (no guarantee of UTF8-ness)
-    BINARY = 14,
+    BINARY,
 
     // By default, int32 days since the UNIX epoch
-    DATE = 16,
+    DATE,
 
     // Exact timestamp encoded with int64 since UNIX epoch
     // Default unit millisecond
-    TIMESTAMP = 17,
+    TIMESTAMP,
 
     // Exact time encoded with int64, default unit millisecond
-    TIME = 18,
+    TIME,
 
     // YEAR_MONTH or DAY_TIME interval in SQL style
-    INTERVAL = 19,
+    INTERVAL,
 
     // Precision- and scale-based decimal type. Storage type depends on the
     // parameters.
-    DECIMAL = 20,
+    DECIMAL,
 
     // A list of some logical data type
-    LIST = 30,
+    LIST,
 
     // Struct of logical types
-    STRUCT = 31,
+    STRUCT,
 
     // Unions of logical types
-    UNION = 32,
+    UNION,
 
-    // Timestamp as double seconds since the UNIX epoch
-    TIMESTAMP_DOUBLE = 33,
-
-    // Decimal value encoded as a text string
-    DECIMAL_TEXT = 34,
+    // Dictionary aka Category type
+    DICTIONARY
   };
 };
 
@@ -115,6 +112,34 @@ class BufferDescr {
   int bit_width_;
 };
 
+class TypeVisitor {
+ public:
+  virtual Status Visit(const NullType& type) = 0;
+  virtual Status Visit(const BooleanType& type) = 0;
+  virtual Status Visit(const Int8Type& type) = 0;
+  virtual Status Visit(const Int16Type& type) = 0;
+  virtual Status Visit(const Int32Type& type) = 0;
+  virtual Status Visit(const Int64Type& type) = 0;
+  virtual Status Visit(const UInt8Type& type) = 0;
+  virtual Status Visit(const UInt16Type& type) = 0;
+  virtual Status Visit(const UInt32Type& type) = 0;
+  virtual Status Visit(const UInt64Type& type) = 0;
+  virtual Status Visit(const HalfFloatType& type) = 0;
+  virtual Status Visit(const FloatType& type) = 0;
+  virtual Status Visit(const DoubleType& type) = 0;
+  virtual Status Visit(const StringType& type) = 0;
+  virtual Status Visit(const BinaryType& type) = 0;
+  virtual Status Visit(const DateType& type) = 0;
+  virtual Status Visit(const TimeType& type) = 0;
+  virtual Status Visit(const TimestampType& type) = 0;
+  virtual Status Visit(const IntervalType& type) = 0;
+  virtual Status Visit(const DecimalType& type) = 0;
+  virtual Status Visit(const ListType& type) = 0;
+  virtual Status Visit(const StructType& type) = 0;
+  virtual Status Visit(const UnionType& type) = 0;
+  virtual Status Visit(const DictionaryType& type) = 0;
+};
+
 struct ARROW_EXPORT DataType {
   Type::type type;
 
@@ -128,10 +153,10 @@ struct ARROW_EXPORT DataType {
   //
   // Types that are logically convertable from one to another e.g. List<UInt8>
   // and Binary are NOT equal).
-  virtual bool Equals(const DataType* other) const;
+  virtual bool Equals(const DataType& other) const;
 
   bool Equals(const std::shared_ptr<DataType>& other) const {
-    return Equals(other.get());
+    return Equals(*other.get());
   }
 
   std::shared_ptr<Field> child(int i) const { return children_[i]; }
@@ -189,16 +214,9 @@ struct ARROW_EXPORT Field {
       : name(name), type(type), nullable(nullable), dictionary(dictionary) {}
 
   bool operator==(const Field& other) const { return this->Equals(other); }
-
   bool operator!=(const Field& other) const { return !this->Equals(other); }
-
-  bool Equals(const Field& other) const {
-    return (this == &other) ||
-           (this->name == other.name && this->nullable == other.nullable &&
-               this->dictionary == dictionary && this->type->Equals(other.type.get()));
-  }
-
-  bool Equals(const std::shared_ptr<Field>& other) const { return Equals(*other.get()); }
+  bool Equals(const Field& other) const;
+  bool Equals(const std::shared_ptr<Field>& other) const;
 
   std::string ToString() const;
 };
@@ -414,6 +432,9 @@ struct ARROW_EXPORT UnionType : public DataType {
   std::vector<uint8_t> type_ids;
 };
 
+// ----------------------------------------------------------------------
+// Date and time types
+
 struct ARROW_EXPORT DateType : public FixedWidthType {
   static constexpr Type::type type_id = Type::DATE;
 
@@ -488,6 +509,35 @@ struct ARROW_EXPORT IntervalType : public FixedWidthType {
   static std::string name() { return "date"; }
 };
 
+// ----------------------------------------------------------------------
+// DictionaryType (for categorical or dictionary-encoded data)
+
+class ARROW_EXPORT DictionaryType : public FixedWidthType {
+ public:
+  static constexpr Type::type type_id = Type::DICTIONARY;
+
+  DictionaryType(const std::shared_ptr<DataType>& index_type,
+      const std::shared_ptr<Array>& dictionary);
+
+  int bit_width() const override;
+
+  std::shared_ptr<DataType> index_type() const { return index_type_; }
+
+  std::shared_ptr<Array> dictionary() const;
+
+  bool Equals(const DataType& other) const override;
+
+  Status Accept(TypeVisitor* visitor) const override;
+  std::string ToString() const override;
+
+ private:
+  // Must be an integer type (not currently checked)
+  std::shared_ptr<DataType> index_type_;
+
+  std::shared_ptr<Array> dictionary_;
+};
+
+// ----------------------------------------------------------------------
 // Factory functions
 
 std::shared_ptr<DataType> ARROW_EXPORT null();
@@ -520,9 +570,44 @@ std::shared_ptr<DataType> ARROW_EXPORT union_(
     const std::vector<std::shared_ptr<Field>>& child_fields,
     const std::vector<uint8_t>& type_ids, UnionMode mode = UnionMode::SPARSE);
 
+std::shared_ptr<DataType> ARROW_EXPORT dictionary(
+    const std::shared_ptr<DataType>& index_type, const std::shared_ptr<Array>& values);
+
 std::shared_ptr<Field> ARROW_EXPORT field(const std::string& name,
     const std::shared_ptr<DataType>& type, bool nullable = true, int64_t dictionary = 0);
 
+// ----------------------------------------------------------------------
+// Type category predicates
+
+static inline bool is_integer(Type::type type_id) {
+  switch (type_id) {
+    case Type::UINT8:
+    case Type::INT8:
+    case Type::UINT16:
+    case Type::INT16:
+    case Type::UINT32:
+    case Type::INT32:
+    case Type::UINT64:
+    case Type::INT64:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
+static inline bool is_floating(Type::type type_id) {
+  switch (type_id) {
+    case Type::HALF_FLOAT:
+    case Type::FLOAT:
+    case Type::DOUBLE:
+      return true;
+    default:
+      break;
+  }
+  return false;
+}
+
 }  // namespace arrow
 
 #endif  // ARROW_TYPE_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/cpp/src/arrow/type_fwd.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index a14c535..334abef 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -32,6 +32,9 @@ class MemoryPool;
 class RecordBatch;
 class Schema;
 
+class DictionaryType;
+class DictionaryArray;
+
 struct NullType;
 class NullArray;
 
@@ -101,60 +104,6 @@ using TimestampBuilder = NumericBuilder<TimestampType>;
 struct IntervalType;
 using IntervalArray = NumericArray<IntervalType>;
 
-class TypeVisitor {
- public:
-  virtual Status Visit(const NullType& type) = 0;
-  virtual Status Visit(const BooleanType& type) = 0;
-  virtual Status Visit(const Int8Type& type) = 0;
-  virtual Status Visit(const Int16Type& type) = 0;
-  virtual Status Visit(const Int32Type& type) = 0;
-  virtual Status Visit(const Int64Type& type) = 0;
-  virtual Status Visit(const UInt8Type& type) = 0;
-  virtual Status Visit(const UInt16Type& type) = 0;
-  virtual Status Visit(const UInt32Type& type) = 0;
-  virtual Status Visit(const UInt64Type& type) = 0;
-  virtual Status Visit(const HalfFloatType& type) = 0;
-  virtual Status Visit(const FloatType& type) = 0;
-  virtual Status Visit(const DoubleType& type) = 0;
-  virtual Status Visit(const StringType& type) = 0;
-  virtual Status Visit(const BinaryType& type) = 0;
-  virtual Status Visit(const DateType& type) = 0;
-  virtual Status Visit(const TimeType& type) = 0;
-  virtual Status Visit(const TimestampType& type) = 0;
-  virtual Status Visit(const IntervalType& type) = 0;
-  virtual Status Visit(const DecimalType& type) = 0;
-  virtual Status Visit(const ListType& type) = 0;
-  virtual Status Visit(const StructType& type) = 0;
-  virtual Status Visit(const UnionType& type) = 0;
-};
-
-class ArrayVisitor {
- public:
-  virtual Status Visit(const NullArray& array) = 0;
-  virtual Status Visit(const BooleanArray& array) = 0;
-  virtual Status Visit(const Int8Array& array) = 0;
-  virtual Status Visit(const Int16Array& array) = 0;
-  virtual Status Visit(const Int32Array& array) = 0;
-  virtual Status Visit(const Int64Array& array) = 0;
-  virtual Status Visit(const UInt8Array& array) = 0;
-  virtual Status Visit(const UInt16Array& array) = 0;
-  virtual Status Visit(const UInt32Array& array) = 0;
-  virtual Status Visit(const UInt64Array& array) = 0;
-  virtual Status Visit(const HalfFloatArray& array) = 0;
-  virtual Status Visit(const FloatArray& array) = 0;
-  virtual Status Visit(const DoubleArray& array) = 0;
-  virtual Status Visit(const StringArray& array) = 0;
-  virtual Status Visit(const BinaryArray& array) = 0;
-  virtual Status Visit(const DateArray& array) = 0;
-  virtual Status Visit(const TimeArray& array) = 0;
-  virtual Status Visit(const TimestampArray& array) = 0;
-  virtual Status Visit(const IntervalArray& array) = 0;
-  virtual Status Visit(const DecimalArray& array) = 0;
-  virtual Status Visit(const ListArray& array) = 0;
-  virtual Status Visit(const StructArray& array) = 0;
-  virtual Status Visit(const UnionArray& array) = 0;
-};
-
 }  // namespace arrow
 
 #endif  // ARROW_TYPE_FWD_H

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/format/Message.fbs
----------------------------------------------------------------------
diff --git a/format/Message.fbs b/format/Message.fbs
index d07d066..b2c6464 100644
--- a/format/Message.fbs
+++ b/format/Message.fbs
@@ -256,7 +256,7 @@ table RecordBatch {
 /// For sending dictionary encoding information. Any Field can be
 /// dictionary-encoded, but in this case none of its children may be
 /// dictionary-encoded.
-/// There is one dictionary batch per dictionary
+/// There is one vector / column per dictionary
 ///
 
 table DictionaryBatch {

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/python/pyarrow/includes/libarrow.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd
index 40fb60d..3cdfe49 100644
--- a/python/pyarrow/includes/libarrow.pxd
+++ b/python/pyarrow/includes/libarrow.pxd
@@ -55,7 +55,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
     cdef cppclass CDataType" arrow::DataType":
         Type type
 
-        c_bool Equals(const CDataType* other)
+        c_bool Equals(const shared_ptr[CDataType]& other)
+        c_bool Equals(const CDataType& other)
 
         c_string ToString()
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/python/pyarrow/includes/parquet.pxd
----------------------------------------------------------------------
diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd
index b4d127c..d9e121d 100644
--- a/python/pyarrow/includes/parquet.pxd
+++ b/python/pyarrow/includes/parquet.pxd
@@ -98,7 +98,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
         # TODO: Some default arguments are missing
         @staticmethod
         unique_ptr[ParquetFileReader] OpenFile(const c_string& path)
-        const FileMetaData* metadata();
+        shared_ptr[FileMetaData] metadata();
 
 
 cdef extern from "parquet/api/writer.h" namespace "parquet" nogil:

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/python/pyarrow/parquet.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx
index 7379456..c092185 100644
--- a/python/pyarrow/parquet.pyx
+++ b/python/pyarrow/parquet.pyx
@@ -98,8 +98,8 @@ cdef class ParquetReader:
             Integer index of the position of the column
         """
         cdef:
-            const FileMetaData* metadata = (self.reader.get()
-                                            .parquet_reader().metadata())
+            const FileMetaData* metadata = (self.reader.get().parquet_reader()
+                                            .metadata().get())
             int i = 0
 
         if self.column_idx_map is None:

http://git-wip-us.apache.org/repos/asf/arrow/blob/74685f38/python/pyarrow/schema.pyx
----------------------------------------------------------------------
diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx
index 7a69b0f..d91ae7c 100644
--- a/python/pyarrow/schema.pyx
+++ b/python/pyarrow/schema.pyx
@@ -45,9 +45,9 @@ cdef class DataType:
 
     def __richcmp__(DataType self, DataType other, int op):
         if op == cpython.Py_EQ:
-            return self.type.Equals(other.type)
+            return self.type.Equals(other.sp_type)
         elif op == cpython.Py_NE:
-            return not self.type.Equals(other.type)
+            return not self.type.Equals(other.sp_type)
         else:
             raise TypeError('Invalid comparison')