You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/11 17:12:40 UTC

arrow git commit: ARROW-1513: C++: Add cast from Dictionary to plain arrays [Forced Update!]

Repository: arrow
Updated Branches:
  refs/heads/master a1237a28f -> 7e9576426 (forced update)


ARROW-1513: C++: Add cast from Dictionary to plain arrays

Author: Wes McKinney <we...@twosigma.com>
Author: Uwe L. Korn <uw...@xhochy.com>

Closes #1086 from xhochy/ARROW-1513 and squashes the following commits:

769b205e [Wes McKinney] Fix compiler error / warning. Change type checks to DCHECK
05c539f2 [Uwe L. Korn] ARROW-1513: C++: Add cast from Dictionary to plain arrays

Change-Id: I80d444e7a0661d58909801004ac9818dece04a15


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7e957642
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7e957642
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7e957642

Branch: refs/heads/master
Commit: 7e95764263c34a3bfabf0ca9909f425c16042141
Parents: 94f6247
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Mon Sep 11 13:11:37 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Sep 11 13:12:24 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/compute/cast.cc         | 344 +++++++++++++++++++++++++----
 cpp/src/arrow/compute/compute-test.cc |  26 ++-
 cpp/src/arrow/table-test.cc           |  66 +++---
 cpp/src/arrow/test-common.h           |  66 +++++-
 cpp/src/arrow/type.h                  |   5 +
 cpp/src/arrow/util/bit-util.h         |   6 +-
 6 files changed, 429 insertions(+), 84 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/compute/cast.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc
index 3885fdf..c651244 100644
--- a/cpp/src/arrow/compute/cast.cc
+++ b/cpp/src/arrow/compute/cast.cc
@@ -28,6 +28,8 @@
 
 #include "arrow/array.h"
 #include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/compare.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
@@ -37,6 +39,32 @@
 #include "arrow/compute/context.h"
 #include "arrow/compute/kernel.h"
 
+#ifdef ARROW_EXTRA_ERROR_CONTEXT
+
+#define FUNC_RETURN_NOT_OK(s)                                                       \
+  do {                                                                              \
+    Status _s = (s);                                                                \
+    if (ARROW_PREDICT_FALSE(!_s.ok())) {                                            \
+      std::stringstream ss;                                                         \
+      ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \
+      ctx->SetStatus(Status(_s.code(), ss.str()));                                  \
+      return;                                                                       \
+    }                                                                               \
+  } while (0)
+
+#else
+
+#define FUNC_RETURN_NOT_OK(s)            \
+  do {                                   \
+    Status _s = (s);                     \
+    if (ARROW_PREDICT_FALSE(!_s.ok())) { \
+      ctx->SetStatus(_s);                \
+      return;                            \
+    }                                    \
+  } while (0)
+
+#endif  // ARROW_EXTRA_ERROR_CONTEXT
+
 namespace arrow {
 namespace compute {
 
@@ -226,19 +254,219 @@ struct CastFunctor<O, I,
 };
 
 // ----------------------------------------------------------------------
+// Dictionary to other things
+
+template <typename IndexType>
+void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
+                                     const FixedSizeBinaryArray& dictionary,
+                                     ArrayData* output) {
+  using index_c_type = typename IndexType::c_type;
+  const uint8_t* valid_bits = indices.null_bitmap_data();
+  INIT_BITSET(valid_bits, indices.offset());
+
+  const index_c_type* in =
+      reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
+      indices.offset();
+  uint8_t* out = output->buffers[1]->mutable_data();
+  int32_t byte_width =
+      static_cast<const FixedSizeBinaryType&>(*output->type).byte_width();
+  for (int64_t i = 0; i < indices.length(); ++i) {
+    if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      const uint8_t* value = dictionary.Value(in[i]);
+      memcpy(out + i * byte_width, value, byte_width);
+    }
+    READ_NEXT_BITSET(valid_bits);
+  }
+}
+
+template <typename T>
+struct CastFunctor<
+    T, DictionaryType,
+    typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value>::type> {
+  void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input,
+                  ArrayData* output) {
+    const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input);
+    const DictionaryType& type = static_cast<const DictionaryType&>(*input.type());
+    const DataType& values_type = *type.dictionary()->type();
+    const FixedSizeBinaryArray& dictionary =
+        static_cast<const FixedSizeBinaryArray&>(*type.dictionary());
+
+    // Check if values and output type match
+    DCHECK(values_type.Equals(*output->type))
+      << "Dictionary type: " << values_type
+      << " target type: " << (*output->type);
+
+    const Array& indices = *dict_array.indices();
+    switch (indices.type()->id()) {
+      case Type::INT8:
+        UnpackFixedSizeBinaryDictionary<Int8Type>(ctx, indices, dictionary, output);
+        break;
+      case Type::INT16:
+        UnpackFixedSizeBinaryDictionary<Int16Type>(ctx, indices, dictionary, output);
+        break;
+      case Type::INT32:
+        UnpackFixedSizeBinaryDictionary<Int32Type>(ctx, indices, dictionary, output);
+        break;
+      case Type::INT64:
+        UnpackFixedSizeBinaryDictionary<Int64Type>(ctx, indices, dictionary, output);
+        break;
+      default:
+        std::stringstream ss;
+        ss << "Invalid index type: " << indices.type()->ToString();
+        ctx->SetStatus(Status::Invalid(ss.str()));
+        return;
+    }
+  }
+};
+
+template <typename IndexType>
+Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices,
+                              const BinaryArray& dictionary, ArrayData* output) {
+  using index_c_type = typename IndexType::c_type;
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), output->type, &builder));
+  BinaryBuilder* binary_builder = static_cast<BinaryBuilder*>(builder.get());
+
+  const uint8_t* valid_bits = indices.null_bitmap_data();
+  INIT_BITSET(valid_bits, indices.offset());
+
+  const index_c_type* in =
+      reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
+      indices.offset();
+  for (int64_t i = 0; i < indices.length(); ++i) {
+    if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      int32_t length;
+      const uint8_t* value = dictionary.GetValue(in[i], &length);
+      RETURN_NOT_OK(binary_builder->Append(value, length));
+    } else {
+      RETURN_NOT_OK(binary_builder->AppendNull());
+    }
+    READ_NEXT_BITSET(valid_bits);
+  }
+
+  std::shared_ptr<Array> plain_array;
+  RETURN_NOT_OK(binary_builder->Finish(&plain_array));
+  // Copy all buffer except the valid bitmap
+  for (size_t i = 1; i < plain_array->data()->buffers.size(); i++) {
+    output->buffers.push_back(plain_array->data()->buffers[i]);
+  }
+
+  return Status::OK();
+}
+
+template <typename T>
+struct CastFunctor<T, DictionaryType,
+                   typename std::enable_if<std::is_base_of<BinaryType, T>::value>::type> {
+  void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input,
+                  ArrayData* output) {
+    const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input);
+    const DictionaryType& type = static_cast<const DictionaryType&>(*input.type());
+    const DataType& values_type = *type.dictionary()->type();
+    const BinaryArray& dictionary = static_cast<const BinaryArray&>(*type.dictionary());
+
+    // Check if values and output type match
+    DCHECK(values_type.Equals(*output->type))
+      << "Dictionary type: " << values_type
+      << " target type: " << (*output->type);
+
+    const Array& indices = *dict_array.indices();
+    switch (indices.type()->id()) {
+      case Type::INT8:
+        FUNC_RETURN_NOT_OK(
+            (UnpackBinaryDictionary<Int8Type>(ctx, indices, dictionary, output)));
+        break;
+      case Type::INT16:
+        FUNC_RETURN_NOT_OK(
+            (UnpackBinaryDictionary<Int16Type>(ctx, indices, dictionary, output)));
+        break;
+      case Type::INT32:
+        FUNC_RETURN_NOT_OK(
+            (UnpackBinaryDictionary<Int32Type>(ctx, indices, dictionary, output)));
+        break;
+      case Type::INT64:
+        FUNC_RETURN_NOT_OK(
+            (UnpackBinaryDictionary<Int64Type>(ctx, indices, dictionary, output)));
+        break;
+      default:
+        std::stringstream ss;
+        ss << "Invalid index type: " << indices.type()->ToString();
+        ctx->SetStatus(Status::Invalid(ss.str()));
+        return;
+    }
+  }
+};
+
+template <typename IndexType, typename c_type>
+void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
+                               c_type* out) {
+  using index_c_type = typename IndexType::c_type;
+
+  const uint8_t* valid_bits = indices.null_bitmap_data();
+  INIT_BITSET(valid_bits, indices.offset());
+
+  const index_c_type* in =
+      reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
+      indices.offset();
+  for (int64_t i = 0; i < indices.length(); ++i) {
+    if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+      out[i] = dictionary[in[i]];
+    }
+    READ_NEXT_BITSET(valid_bits);
+  }
+}
+
+// Cast from dictionary to plain representation
+template <typename T>
+struct CastFunctor<T, DictionaryType,
+                   typename std::enable_if<IsNumeric<T>::value>::type> {
+  void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input,
+                  ArrayData* output) {
+    using c_type = typename T::c_type;
+
+    const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input);
+    const DictionaryType& type = static_cast<const DictionaryType&>(*input.type());
+    const DataType& values_type = *type.dictionary()->type();
+
+    // Check if values and output type match
+    DCHECK(values_type.Equals(*output->type))
+      << "Dictionary type: " << values_type
+      << " target type: " << (*output->type);
+
+    auto dictionary =
+        reinterpret_cast<const c_type*>(type.dictionary()->data()->buffers[1]->data()) +
+        type.dictionary()->offset();
+    auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data());
+    const Array& indices = *dict_array.indices();
+    switch (indices.type()->id()) {
+      case Type::INT8:
+        UnpackPrimitiveDictionary<Int8Type, c_type>(indices, dictionary, out);
+        break;
+      case Type::INT16:
+        UnpackPrimitiveDictionary<Int16Type, c_type>(indices, dictionary, out);
+        break;
+      case Type::INT32:
+        UnpackPrimitiveDictionary<Int32Type, c_type>(indices, dictionary, out);
+        break;
+      case Type::INT64:
+        UnpackPrimitiveDictionary<Int64Type, c_type>(indices, dictionary, out);
+        break;
+      default:
+        std::stringstream ss;
+        ss << "Invalid index type: " << indices.type()->ToString();
+        ctx->SetStatus(Status::Invalid(ss.str()));
+        return;
+    }
+  }
+};
+
+// ----------------------------------------------------------------------
 
 typedef std::function<void(FunctionContext*, const CastOptions& options, const Array&,
                            ArrayData*)>
     CastFunction;
 
 static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input,
-                                        ArrayData* out) {
-  if (!is_primitive(out->type->id())) {
-    return Status::NotImplemented(out->type->ToString());
-  }
-
-  const auto& fw_type = static_cast<const FixedWidthType&>(*out->type);
-
+                                        bool can_pre_allocate_values, ArrayData* out) {
   const int64_t length = input.length();
 
   out->null_count = input.null_count();
@@ -261,35 +489,49 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input
 
   out->buffers.push_back(validity_bitmap);
 
-  std::shared_ptr<Buffer> out_data;
+  if (can_pre_allocate_values) {
+    std::shared_ptr<Buffer> out_data;
 
-  int bit_width = fw_type.bit_width();
-  int64_t buffer_size = 0;
+    if (!(is_primitive(out->type->id()) || out->type->id() == Type::FIXED_SIZE_BINARY)) {
+      std::stringstream ss;
+      ss << "Cannot pre-allocate memory for type: " << out->type->ToString();
+      return Status::NotImplemented(ss.str());
+    }
 
-  if (bit_width == 1) {
-    buffer_size = BitUtil::BytesForBits(length);
-  } else if (bit_width % 8 == 0) {
-    buffer_size = length * fw_type.bit_width() / 8;
-  } else {
-    DCHECK(false);
-  }
+    const auto& fw_type = static_cast<const FixedWidthType&>(*out->type);
+
+    int bit_width = fw_type.bit_width();
+    int64_t buffer_size = 0;
 
-  RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data));
-  memset(out_data->mutable_data(), 0, buffer_size);
+    if (bit_width == 1) {
+      buffer_size = BitUtil::BytesForBits(length);
+    } else if (bit_width % 8 == 0) {
+      buffer_size = length * fw_type.bit_width() / 8;
+    } else {
+      DCHECK(false);
+    }
 
-  out->buffers.push_back(out_data);
+    RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data));
+    memset(out_data->mutable_data(), 0, buffer_size);
+
+    out->buffers.push_back(out_data);
+  }
 
   return Status::OK();
 }
 
 class CastKernel : public UnaryKernel {
  public:
-  CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy)
-      : options_(options), func_(func), is_zero_copy_(is_zero_copy) {}
+  CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy,
+             bool can_pre_allocate_values)
+      : options_(options),
+        func_(func),
+        is_zero_copy_(is_zero_copy),
+        can_pre_allocate_values_(can_pre_allocate_values) {}
 
   Status Call(FunctionContext* ctx, const Array& input, ArrayData* out) override {
     if (!is_zero_copy_) {
-      RETURN_NOT_OK(AllocateIfNotPreallocated(ctx, input, out));
+      RETURN_NOT_OK(AllocateIfNotPreallocated(ctx, input, can_pre_allocate_values_, out));
     }
     func_(ctx, options_, input, out);
     RETURN_IF_ERROR(ctx);
@@ -300,11 +542,14 @@ class CastKernel : public UnaryKernel {
   CastOptions options_;
   CastFunction func_;
   bool is_zero_copy_;
+  bool can_pre_allocate_values_;
 };
 
 #define CAST_CASE(InType, OutType)                                                  \
   case OutType::type_id:                                                            \
     is_zero_copy = is_zero_copy_cast<OutType, InType>::value;                       \
+    can_pre_allocate_values =                                                       \
+        !(!is_binary_like(InType::type_id) && is_binary_like(OutType::type_id));    \
     func = [](FunctionContext* ctx, const CastOptions& options, const Array& input, \
               ArrayData* out) {                                                     \
       CastFunctor<OutType, InType> func;                                            \
@@ -354,20 +599,42 @@ class CastKernel : public UnaryKernel {
 
 #define TIMESTAMP_CASES(FN, IN_TYPE) FN(TimestampType, TimestampType);
 
-#define GET_CAST_FUNCTION(CASE_GENERATOR, InType)                                       \
-  static std::unique_ptr<UnaryKernel> Get##InType##CastFunc(                            \
-      const std::shared_ptr<DataType>& out_type, const CastOptions& options) {          \
-    CastFunction func;                                                                  \
-    bool is_zero_copy = false;                                                          \
-    switch (out_type->id()) {                                                           \
-      CASE_GENERATOR(CAST_CASE, InType);                                                \
-      default:                                                                          \
-        break;                                                                          \
-    }                                                                                   \
-    if (func != nullptr) {                                                              \
-      return std::unique_ptr<UnaryKernel>(new CastKernel(options, func, is_zero_copy)); \
-    }                                                                                   \
-    return nullptr;                                                                     \
+#define DICTIONARY_CASES(FN, IN_TYPE) \
+  FN(IN_TYPE, Time32Type);            \
+  FN(IN_TYPE, Date32Type);            \
+  FN(IN_TYPE, TimestampType);         \
+  FN(IN_TYPE, Time64Type);            \
+  FN(IN_TYPE, Date64Type);            \
+  FN(IN_TYPE, UInt8Type);             \
+  FN(IN_TYPE, Int8Type);              \
+  FN(IN_TYPE, UInt16Type);            \
+  FN(IN_TYPE, Int16Type);             \
+  FN(IN_TYPE, UInt32Type);            \
+  FN(IN_TYPE, Int32Type);             \
+  FN(IN_TYPE, UInt64Type);            \
+  FN(IN_TYPE, Int64Type);             \
+  FN(IN_TYPE, FloatType);             \
+  FN(IN_TYPE, DoubleType);            \
+  FN(IN_TYPE, FixedSizeBinaryType);   \
+  FN(IN_TYPE, BinaryType);            \
+  FN(IN_TYPE, StringType);
+
+#define GET_CAST_FUNCTION(CASE_GENERATOR, InType)                                \
+  static std::unique_ptr<UnaryKernel> Get##InType##CastFunc(                     \
+      const std::shared_ptr<DataType>& out_type, const CastOptions& options) {   \
+    CastFunction func;                                                           \
+    bool is_zero_copy = false;                                                   \
+    bool can_pre_allocate_values = true;                                         \
+    switch (out_type->id()) {                                                    \
+      CASE_GENERATOR(CAST_CASE, InType);                                         \
+      default:                                                                   \
+        break;                                                                   \
+    }                                                                            \
+    if (func != nullptr) {                                                       \
+      return std::unique_ptr<UnaryKernel>(                                       \
+          new CastKernel(options, func, is_zero_copy, can_pre_allocate_values)); \
+    }                                                                            \
+    return nullptr;                                                              \
   }
 
 GET_CAST_FUNCTION(NULL_CASES, NullType);
@@ -388,6 +655,8 @@ GET_CAST_FUNCTION(TIME32_CASES, Time32Type);
 GET_CAST_FUNCTION(TIME64_CASES, Time64Type);
 GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType);
 
+GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType);
+
 #define CAST_FUNCTION_CASE(InType)                      \
   case InType::type_id:                                 \
     *kernel = Get##InType##CastFunc(out_type, options); \
@@ -413,6 +682,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr<DataType>&
     CAST_FUNCTION_CASE(Time32Type);
     CAST_FUNCTION_CASE(Time64Type);
     CAST_FUNCTION_CASE(TimestampType);
+    CAST_FUNCTION_CASE(DictionaryType);
     default:
       break;
   }

http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/compute/compute-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index ba645c2..5898aee 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -66,10 +66,9 @@ void AssertArraysEqual(const Array& left, const Array& right) {
 
 class ComputeFixture {
  public:
-  ComputeFixture() : pool_(default_memory_pool()), ctx_(pool_) {}
+  ComputeFixture() : ctx_(default_memory_pool()) {}
 
  protected:
-  MemoryPool* pool_;
   FunctionContext ctx_;
 };
 
@@ -81,7 +80,7 @@ static void AssertBufferSame(const Array& left, const Array& right, int buffer_i
             right.data()->buffers[buffer_index].get());
 }
 
-class TestCast : public ComputeFixture, public ::testing::Test {
+class TestCast : public ComputeFixture, public TestBase {
  public:
   void CheckPass(const Array& input, const Array& expected,
                  const std::shared_ptr<DataType>& out_type, const CastOptions& options) {
@@ -395,5 +394,26 @@ TEST_F(TestCast, PreallocatedMemory) {
   AssertArraysEqual(*expected, *result);
 }
 
+template <typename TestType>
+class TestDictionaryCast : public TestCast {};
+
+typedef ::testing::Types<UInt8Type, Int8Type, UInt16Type, Int16Type, Int32Type,
+                         UInt32Type, UInt64Type, Int64Type, FloatType, DoubleType,
+                         Date32Type, Date64Type, FixedSizeBinaryType, BinaryType>
+    TestTypes;
+
+TYPED_TEST_CASE(TestDictionaryCast, TestTypes);
+
+TYPED_TEST(TestDictionaryCast, Basic) {
+  CastOptions options;
+  std::shared_ptr<Array> plain_array =
+      TestBase::MakeRandomArray<typename TypeTraits<TypeParam>::ArrayType>(10, 2);
+
+  std::shared_ptr<Array> dict_array;
+  ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array));
+
+  this->CheckPass(*dict_array, *plain_array, plain_array->type(), options);
+}
+
 }  // namespace compute
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index a9c7e6d..b0aeed1 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -128,9 +128,9 @@ class TestColumn : public TestChunkedArray {
 
 TEST_F(TestColumn, BasicAPI) {
   ArrayVector arrays;
-  arrays.push_back(MakePrimitive<Int32Array>(100));
-  arrays.push_back(MakePrimitive<Int32Array>(100, 10));
-  arrays.push_back(MakePrimitive<Int32Array>(100, 20));
+  arrays.push_back(MakeRandomArray<Int32Array>(100));
+  arrays.push_back(MakeRandomArray<Int32Array>(100, 10));
+  arrays.push_back(MakeRandomArray<Int32Array>(100, 20));
 
   auto f0 = field("c0", int32());
   column_.reset(new Column(f0, arrays));
@@ -148,15 +148,15 @@ TEST_F(TestColumn, BasicAPI) {
 
 TEST_F(TestColumn, ChunksInhomogeneous) {
   ArrayVector arrays;
-  arrays.push_back(MakePrimitive<Int32Array>(100));
-  arrays.push_back(MakePrimitive<Int32Array>(100, 10));
+  arrays.push_back(MakeRandomArray<Int32Array>(100));
+  arrays.push_back(MakeRandomArray<Int32Array>(100, 10));
 
   auto f0 = field("c0", int32());
   column_.reset(new Column(f0, arrays));
 
   ASSERT_OK(column_->ValidateData());
 
-  arrays.push_back(MakePrimitive<Int16Array>(100, 10));
+  arrays.push_back(MakeRandomArray<Int16Array>(100, 10));
   column_.reset(new Column(f0, arrays));
   ASSERT_RAISES(Invalid, column_->ValidateData());
 }
@@ -202,8 +202,8 @@ class TestTable : public TestBase {
     vector<shared_ptr<Field>> fields = {f0, f1, f2};
     schema_ = std::make_shared<Schema>(fields);
 
-    arrays_ = {MakePrimitive<Int32Array>(length), MakePrimitive<UInt8Array>(length),
-               MakePrimitive<Int16Array>(length)};
+    arrays_ = {MakeRandomArray<Int32Array>(length), MakeRandomArray<UInt8Array>(length),
+               MakeRandomArray<Int16Array>(length)};
 
     columns_ = {std::make_shared<Column>(schema_->field(0), arrays_[0]),
                 std::make_shared<Column>(schema_->field(1), arrays_[1]),
@@ -276,9 +276,10 @@ TEST_F(TestTable, InvalidColumns) {
   ASSERT_RAISES(Invalid, table_->ValidateColumns());
 
   columns_ = {
-      std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length)),
-      std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)),
-      std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length - 1))};
+      std::make_shared<Column>(schema_->field(0), MakeRandomArray<Int32Array>(length)),
+      std::make_shared<Column>(schema_->field(1), MakeRandomArray<UInt8Array>(length)),
+      std::make_shared<Column>(schema_->field(2),
+                               MakeRandomArray<Int16Array>(length - 1))};
 
   table_.reset(new Table(schema_, columns_, length));
   ASSERT_RAISES(Invalid, table_->ValidateColumns());
@@ -300,9 +301,12 @@ TEST_F(TestTable, Equals) {
   ASSERT_FALSE(table_->Equals(Table(other_schema, columns_)));
   // Differing columns
   std::vector<std::shared_ptr<Column>> other_columns = {
-      std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length, 10)),
-      std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length, 10)),
-      std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length, 10))};
+      std::make_shared<Column>(schema_->field(0),
+                               MakeRandomArray<Int32Array>(length, 10)),
+      std::make_shared<Column>(schema_->field(1),
+                               MakeRandomArray<UInt8Array>(length, 10)),
+      std::make_shared<Column>(schema_->field(2),
+                               MakeRandomArray<Int16Array>(length, 10))};
   ASSERT_FALSE(table_->Equals(Table(schema_, other_columns)));
 }
 
@@ -410,8 +414,8 @@ TEST_F(TestTable, AddColumn) {
   ASSERT_TRUE(status.IsInvalid());
 
   // Add column with wrong length
-  auto longer_col =
-      std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length + 1));
+  auto longer_col = std::make_shared<Column>(schema_->field(0),
+                                             MakeRandomArray<Int32Array>(length + 1));
   status = table.AddColumn(0, longer_col, &result);
   ASSERT_TRUE(status.IsInvalid());
 
@@ -445,8 +449,8 @@ TEST_F(TestTable, AddColumn) {
 TEST_F(TestTable, IsChunked) {
   ArrayVector c1, c2;
 
-  auto a1 = MakePrimitive<Int32Array>(10);
-  auto a2 = MakePrimitive<Int32Array>(20);
+  auto a1 = MakeRandomArray<Int32Array>(10);
+  auto a2 = MakeRandomArray<Int32Array>(20);
 
   auto sch1 = arrow::schema({field("f1", int32()), field("f2", int32())});
 
@@ -477,9 +481,9 @@ TEST_F(TestRecordBatch, Equals) {
   vector<shared_ptr<Field>> fields = {f0, f1, f2};
   auto schema = std::make_shared<Schema>(fields);
 
-  auto a0 = MakePrimitive<Int32Array>(length);
-  auto a1 = MakePrimitive<UInt8Array>(length);
-  auto a2 = MakePrimitive<Int16Array>(length);
+  auto a0 = MakeRandomArray<Int32Array>(length);
+  auto a1 = MakeRandomArray<UInt8Array>(length);
+  auto a2 = MakeRandomArray<Int16Array>(length);
 
   RecordBatch b1(schema, length, {a0, a1, a2});
   RecordBatch b3(schema, length, {a0, a1});
@@ -502,10 +506,10 @@ TEST_F(TestRecordBatch, Validate) {
 
   auto schema = ::arrow::schema({f0, f1, f2});
 
-  auto a0 = MakePrimitive<Int32Array>(length);
-  auto a1 = MakePrimitive<UInt8Array>(length);
-  auto a2 = MakePrimitive<Int16Array>(length);
-  auto a3 = MakePrimitive<Int16Array>(5);
+  auto a0 = MakeRandomArray<Int32Array>(length);
+  auto a1 = MakeRandomArray<UInt8Array>(length);
+  auto a2 = MakeRandomArray<Int16Array>(length);
+  auto a3 = MakeRandomArray<Int16Array>(5);
 
   RecordBatch b1(schema, length, {a0, a1, a2});
 
@@ -531,8 +535,8 @@ TEST_F(TestRecordBatch, Slice) {
   vector<shared_ptr<Field>> fields = {f0, f1};
   auto schema = std::make_shared<Schema>(fields);
 
-  auto a0 = MakePrimitive<Int32Array>(length);
-  auto a1 = MakePrimitive<UInt8Array>(length);
+  auto a0 = MakeRandomArray<Int32Array>(length);
+  auto a1 = MakeRandomArray<UInt8Array>(length);
 
   RecordBatch batch(schema, length, {a0, a1});
 
@@ -555,10 +559,10 @@ class TestTableBatchReader : public TestBase {};
 TEST_F(TestTableBatchReader, ReadNext) {
   ArrayVector c1, c2;
 
-  auto a1 = MakePrimitive<Int32Array>(10);
-  auto a2 = MakePrimitive<Int32Array>(20);
-  auto a3 = MakePrimitive<Int32Array>(30);
-  auto a4 = MakePrimitive<Int32Array>(10);
+  auto a1 = MakeRandomArray<Int32Array>(10);
+  auto a2 = MakeRandomArray<Int32Array>(20);
+  auto a3 = MakeRandomArray<Int32Array>(30);
+  auto a4 = MakeRandomArray<Int32Array>(10);
 
   auto sch1 = arrow::schema({field("f1", int32()), field("f2", int32())});
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-common.h b/cpp/src/arrow/test-common.h
index 4ce0640..3dc39fc 100644
--- a/cpp/src/arrow/test-common.h
+++ b/cpp/src/arrow/test-common.h
@@ -42,15 +42,7 @@ class TestBase : public ::testing::Test {
     random_seed_ = 0;
   }
 
-  template <typename ArrayType>
-  std::shared_ptr<Array> MakePrimitive(int64_t length, int64_t null_count = 0) {
-    auto data = std::make_shared<PoolBuffer>(pool_);
-    const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type);
-    EXPECT_OK(data->Resize(data_nbytes));
-
-    // Fill with random data
-    test::random_bytes(data_nbytes, random_seed_++, data->mutable_data());
-
+  std::shared_ptr<Buffer> MakeRandomNullBitmap(int64_t length, int64_t null_count) {
     auto null_bitmap = std::make_shared<PoolBuffer>(pool_);
     const int64_t null_nbytes = BitUtil::BytesForBits(length);
     EXPECT_OK(null_bitmap->Resize(null_nbytes));
@@ -58,14 +50,68 @@ class TestBase : public ::testing::Test {
     for (int64_t i = 0; i < null_count; i++) {
       BitUtil::ClearBit(null_bitmap->mutable_data(), i * (length / null_count));
     }
-    return std::make_shared<ArrayType>(length, data, null_bitmap, null_count);
+    return null_bitmap;
   }
 
+  template <typename ArrayType>
+  std::shared_ptr<Array> MakeRandomArray(int64_t length, int64_t null_count = 0);
+
  protected:
   uint32_t random_seed_;
   MemoryPool* pool_;
 };
 
+template <typename ArrayType>
+std::shared_ptr<Array> TestBase::MakeRandomArray(int64_t length, int64_t null_count) {
+  auto data = std::make_shared<PoolBuffer>(pool_);
+  const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type);
+  EXPECT_OK(data->Resize(data_nbytes));
+
+  // Fill with random data
+  test::random_bytes(data_nbytes, random_seed_++, data->mutable_data());
+  std::shared_ptr<Buffer> null_bitmap = MakeRandomNullBitmap(length, null_count);
+
+  return std::make_shared<ArrayType>(length, data, null_bitmap, null_count);
+}
+
+template <>
+std::shared_ptr<Array> TestBase::MakeRandomArray<FixedSizeBinaryArray>(
+    int64_t length, int64_t null_count) {
+  const int byte_width = 10;
+  std::shared_ptr<Buffer> null_bitmap = MakeRandomNullBitmap(length, null_count);
+  std::shared_ptr<PoolBuffer> data = std::make_shared<PoolBuffer>(pool_);
+
+  EXPECT_OK(data->Resize(byte_width * length));
+  ::arrow::test::random_bytes(data->size(), 0, data->mutable_data());
+  return std::make_shared<FixedSizeBinaryArray>(fixed_size_binary(byte_width), length,
+                                                data, null_bitmap, null_count);
+}
+
+template <>
+std::shared_ptr<Array> TestBase::MakeRandomArray<BinaryArray>(int64_t length,
+                                                              int64_t null_count) {
+  std::vector<uint8_t> valid_bytes(length, 1);
+  for (int64_t i = 0; i < null_count; i++) {
+    valid_bytes[i * 2] = 0;
+  }
+  BinaryBuilder builder(pool_);
+
+  const int kBufferSize = 10;
+  uint8_t buffer[kBufferSize];
+  for (int64_t i = 0; i < length; i++) {
+    if (!valid_bytes[i]) {
+      EXPECT_OK(builder.AppendNull());
+    } else {
+      ::arrow::test::random_bytes(kBufferSize, static_cast<uint32_t>(i), buffer);
+      EXPECT_OK(builder.Append(buffer, kBufferSize));
+    }
+  }
+
+  std::shared_ptr<Array> out;
+  EXPECT_OK(builder.Finish(&out));
+  return out;
+}
+
 class TestBuilder : public ::testing::Test {
  public:
   void SetUp() {

http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 283e27e..7630f48 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -169,6 +169,11 @@ class ARROW_EXPORT DataType {
   ARROW_DISALLOW_COPY_AND_ASSIGN(DataType);
 };
 
+inline std::ostream& operator<<(std::ostream& os, const DataType& type) {
+  os << type.ToString();
+  return os;
+}
+
 // TODO(wesm): Remove this from parquet-cpp
 using TypePtr = std::shared_ptr<DataType>;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/util/bit-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index aa83746..b8a8efa 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -49,9 +49,9 @@
 
 namespace arrow {
 
-#define INIT_BITSET(valid_bits_vector, valid_bits_index)        \
-  int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
-  int bit_offset_##valid_bits_vector = (valid_bits_index) % 8;  \
+#define INIT_BITSET(valid_bits_vector, valid_bits_index)            \
+  int64_t byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
+  int64_t bit_offset_##valid_bits_vector = (valid_bits_index) % 8;  \
   uint8_t bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector];
 
 #define READ_NEXT_BITSET(valid_bits_vector)                                          \