You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/09/11 17:12:40 UTC
arrow git commit: ARROW-1513: C++: Add cast from Dictionary to plain
arrays [Forced Update!]
Repository: arrow
Updated Branches:
refs/heads/master a1237a28f -> 7e9576426 (forced update)
ARROW-1513: C++: Add cast from Dictionary to plain arrays
Author: Wes McKinney <we...@twosigma.com>
Author: Uwe L. Korn <uw...@xhochy.com>
Closes #1086 from xhochy/ARROW-1513 and squashes the following commits:
769b205e [Wes McKinney] Fix compiler error / warning. Change type checks to DCHECK
05c539f2 [Uwe L. Korn] ARROW-1513: C++: Add cast from Dictionary to plain arrays
Change-Id: I80d444e7a0661d58909801004ac9818dece04a15
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7e957642
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7e957642
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7e957642
Branch: refs/heads/master
Commit: 7e95764263c34a3bfabf0ca9909f425c16042141
Parents: 94f6247
Author: Uwe L. Korn <uw...@xhochy.com>
Authored: Mon Sep 11 13:11:37 2017 -0400
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Sep 11 13:12:24 2017 -0400
----------------------------------------------------------------------
cpp/src/arrow/compute/cast.cc | 344 +++++++++++++++++++++++++----
cpp/src/arrow/compute/compute-test.cc | 26 ++-
cpp/src/arrow/table-test.cc | 66 +++---
cpp/src/arrow/test-common.h | 66 +++++-
cpp/src/arrow/type.h | 5 +
cpp/src/arrow/util/bit-util.h | 6 +-
6 files changed, 429 insertions(+), 84 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/compute/cast.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/cast.cc b/cpp/src/arrow/compute/cast.cc
index 3885fdf..c651244 100644
--- a/cpp/src/arrow/compute/cast.cc
+++ b/cpp/src/arrow/compute/cast.cc
@@ -28,6 +28,8 @@
#include "arrow/array.h"
#include "arrow/buffer.h"
+#include "arrow/builder.h"
+#include "arrow/compare.h"
#include "arrow/type.h"
#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
@@ -37,6 +39,32 @@
#include "arrow/compute/context.h"
#include "arrow/compute/kernel.h"
+#ifdef ARROW_EXTRA_ERROR_CONTEXT
+
+#define FUNC_RETURN_NOT_OK(s) \
+ do { \
+ Status _s = (s); \
+ if (ARROW_PREDICT_FALSE(!_s.ok())) { \
+ std::stringstream ss; \
+ ss << __FILE__ << ":" << __LINE__ << " code: " << #s << "\n" << _s.message(); \
+ ctx->SetStatus(Status(_s.code(), ss.str())); \
+ return; \
+ } \
+ } while (0)
+
+#else
+
+#define FUNC_RETURN_NOT_OK(s) \
+ do { \
+ Status _s = (s); \
+ if (ARROW_PREDICT_FALSE(!_s.ok())) { \
+ ctx->SetStatus(_s); \
+ return; \
+ } \
+ } while (0)
+
+#endif // ARROW_EXTRA_ERROR_CONTEXT
+
namespace arrow {
namespace compute {
@@ -226,19 +254,219 @@ struct CastFunctor<O, I,
};
// ----------------------------------------------------------------------
+// Dictionary to other things
+
+template <typename IndexType>
+void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices,
+ const FixedSizeBinaryArray& dictionary,
+ ArrayData* output) {
+ using index_c_type = typename IndexType::c_type;
+ const uint8_t* valid_bits = indices.null_bitmap_data();
+ INIT_BITSET(valid_bits, indices.offset());
+
+ const index_c_type* in =
+ reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
+ indices.offset();
+ uint8_t* out = output->buffers[1]->mutable_data();
+ int32_t byte_width =
+ static_cast<const FixedSizeBinaryType&>(*output->type).byte_width();
+ for (int64_t i = 0; i < indices.length(); ++i) {
+ if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+ const uint8_t* value = dictionary.Value(in[i]);
+ memcpy(out + i * byte_width, value, byte_width);
+ }
+ READ_NEXT_BITSET(valid_bits);
+ }
+}
+
+template <typename T>
+struct CastFunctor<
+ T, DictionaryType,
+ typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value>::type> {
+ void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input,
+ ArrayData* output) {
+ const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input);
+ const DictionaryType& type = static_cast<const DictionaryType&>(*input.type());
+ const DataType& values_type = *type.dictionary()->type();
+ const FixedSizeBinaryArray& dictionary =
+ static_cast<const FixedSizeBinaryArray&>(*type.dictionary());
+
+ // Check if values and output type match
+ DCHECK(values_type.Equals(*output->type))
+ << "Dictionary type: " << values_type
+ << " target type: " << (*output->type);
+
+ const Array& indices = *dict_array.indices();
+ switch (indices.type()->id()) {
+ case Type::INT8:
+ UnpackFixedSizeBinaryDictionary<Int8Type>(ctx, indices, dictionary, output);
+ break;
+ case Type::INT16:
+ UnpackFixedSizeBinaryDictionary<Int16Type>(ctx, indices, dictionary, output);
+ break;
+ case Type::INT32:
+ UnpackFixedSizeBinaryDictionary<Int32Type>(ctx, indices, dictionary, output);
+ break;
+ case Type::INT64:
+ UnpackFixedSizeBinaryDictionary<Int64Type>(ctx, indices, dictionary, output);
+ break;
+ default:
+ std::stringstream ss;
+ ss << "Invalid index type: " << indices.type()->ToString();
+ ctx->SetStatus(Status::Invalid(ss.str()));
+ return;
+ }
+ }
+};
+
+template <typename IndexType>
+Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices,
+ const BinaryArray& dictionary, ArrayData* output) {
+ using index_c_type = typename IndexType::c_type;
+ std::unique_ptr<ArrayBuilder> builder;
+ RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), output->type, &builder));
+ BinaryBuilder* binary_builder = static_cast<BinaryBuilder*>(builder.get());
+
+ const uint8_t* valid_bits = indices.null_bitmap_data();
+ INIT_BITSET(valid_bits, indices.offset());
+
+ const index_c_type* in =
+ reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
+ indices.offset();
+ for (int64_t i = 0; i < indices.length(); ++i) {
+ if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+ int32_t length;
+ const uint8_t* value = dictionary.GetValue(in[i], &length);
+ RETURN_NOT_OK(binary_builder->Append(value, length));
+ } else {
+ RETURN_NOT_OK(binary_builder->AppendNull());
+ }
+ READ_NEXT_BITSET(valid_bits);
+ }
+
+ std::shared_ptr<Array> plain_array;
+ RETURN_NOT_OK(binary_builder->Finish(&plain_array));
+ // Copy all buffer except the valid bitmap
+ for (size_t i = 1; i < plain_array->data()->buffers.size(); i++) {
+ output->buffers.push_back(plain_array->data()->buffers[i]);
+ }
+
+ return Status::OK();
+}
+
+template <typename T>
+struct CastFunctor<T, DictionaryType,
+ typename std::enable_if<std::is_base_of<BinaryType, T>::value>::type> {
+ void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input,
+ ArrayData* output) {
+ const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input);
+ const DictionaryType& type = static_cast<const DictionaryType&>(*input.type());
+ const DataType& values_type = *type.dictionary()->type();
+ const BinaryArray& dictionary = static_cast<const BinaryArray&>(*type.dictionary());
+
+ // Check if values and output type match
+ DCHECK(values_type.Equals(*output->type))
+ << "Dictionary type: " << values_type
+ << " target type: " << (*output->type);
+
+ const Array& indices = *dict_array.indices();
+ switch (indices.type()->id()) {
+ case Type::INT8:
+ FUNC_RETURN_NOT_OK(
+ (UnpackBinaryDictionary<Int8Type>(ctx, indices, dictionary, output)));
+ break;
+ case Type::INT16:
+ FUNC_RETURN_NOT_OK(
+ (UnpackBinaryDictionary<Int16Type>(ctx, indices, dictionary, output)));
+ break;
+ case Type::INT32:
+ FUNC_RETURN_NOT_OK(
+ (UnpackBinaryDictionary<Int32Type>(ctx, indices, dictionary, output)));
+ break;
+ case Type::INT64:
+ FUNC_RETURN_NOT_OK(
+ (UnpackBinaryDictionary<Int64Type>(ctx, indices, dictionary, output)));
+ break;
+ default:
+ std::stringstream ss;
+ ss << "Invalid index type: " << indices.type()->ToString();
+ ctx->SetStatus(Status::Invalid(ss.str()));
+ return;
+ }
+ }
+};
+
+template <typename IndexType, typename c_type>
+void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary,
+ c_type* out) {
+ using index_c_type = typename IndexType::c_type;
+
+ const uint8_t* valid_bits = indices.null_bitmap_data();
+ INIT_BITSET(valid_bits, indices.offset());
+
+ const index_c_type* in =
+ reinterpret_cast<const index_c_type*>(indices.data()->buffers[1]->data()) +
+ indices.offset();
+ for (int64_t i = 0; i < indices.length(); ++i) {
+ if (bitset_valid_bits & (1 << bit_offset_valid_bits)) {
+ out[i] = dictionary[in[i]];
+ }
+ READ_NEXT_BITSET(valid_bits);
+ }
+}
+
+// Cast from dictionary to plain representation
+template <typename T>
+struct CastFunctor<T, DictionaryType,
+ typename std::enable_if<IsNumeric<T>::value>::type> {
+ void operator()(FunctionContext* ctx, const CastOptions& options, const Array& input,
+ ArrayData* output) {
+ using c_type = typename T::c_type;
+
+ const DictionaryArray& dict_array = static_cast<const DictionaryArray&>(input);
+ const DictionaryType& type = static_cast<const DictionaryType&>(*input.type());
+ const DataType& values_type = *type.dictionary()->type();
+
+ // Check if values and output type match
+ DCHECK(values_type.Equals(*output->type))
+ << "Dictionary type: " << values_type
+ << " target type: " << (*output->type);
+
+ auto dictionary =
+ reinterpret_cast<const c_type*>(type.dictionary()->data()->buffers[1]->data()) +
+ type.dictionary()->offset();
+ auto out = reinterpret_cast<c_type*>(output->buffers[1]->mutable_data());
+ const Array& indices = *dict_array.indices();
+ switch (indices.type()->id()) {
+ case Type::INT8:
+ UnpackPrimitiveDictionary<Int8Type, c_type>(indices, dictionary, out);
+ break;
+ case Type::INT16:
+ UnpackPrimitiveDictionary<Int16Type, c_type>(indices, dictionary, out);
+ break;
+ case Type::INT32:
+ UnpackPrimitiveDictionary<Int32Type, c_type>(indices, dictionary, out);
+ break;
+ case Type::INT64:
+ UnpackPrimitiveDictionary<Int64Type, c_type>(indices, dictionary, out);
+ break;
+ default:
+ std::stringstream ss;
+ ss << "Invalid index type: " << indices.type()->ToString();
+ ctx->SetStatus(Status::Invalid(ss.str()));
+ return;
+ }
+ }
+};
+
+// ----------------------------------------------------------------------
typedef std::function<void(FunctionContext*, const CastOptions& options, const Array&,
ArrayData*)>
CastFunction;
static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input,
- ArrayData* out) {
- if (!is_primitive(out->type->id())) {
- return Status::NotImplemented(out->type->ToString());
- }
-
- const auto& fw_type = static_cast<const FixedWidthType&>(*out->type);
-
+ bool can_pre_allocate_values, ArrayData* out) {
const int64_t length = input.length();
out->null_count = input.null_count();
@@ -261,35 +489,49 @@ static Status AllocateIfNotPreallocated(FunctionContext* ctx, const Array& input
out->buffers.push_back(validity_bitmap);
- std::shared_ptr<Buffer> out_data;
+ if (can_pre_allocate_values) {
+ std::shared_ptr<Buffer> out_data;
- int bit_width = fw_type.bit_width();
- int64_t buffer_size = 0;
+ if (!(is_primitive(out->type->id()) || out->type->id() == Type::FIXED_SIZE_BINARY)) {
+ std::stringstream ss;
+ ss << "Cannot pre-allocate memory for type: " << out->type->ToString();
+ return Status::NotImplemented(ss.str());
+ }
- if (bit_width == 1) {
- buffer_size = BitUtil::BytesForBits(length);
- } else if (bit_width % 8 == 0) {
- buffer_size = length * fw_type.bit_width() / 8;
- } else {
- DCHECK(false);
- }
+ const auto& fw_type = static_cast<const FixedWidthType&>(*out->type);
+
+ int bit_width = fw_type.bit_width();
+ int64_t buffer_size = 0;
- RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data));
- memset(out_data->mutable_data(), 0, buffer_size);
+ if (bit_width == 1) {
+ buffer_size = BitUtil::BytesForBits(length);
+ } else if (bit_width % 8 == 0) {
+ buffer_size = length * fw_type.bit_width() / 8;
+ } else {
+ DCHECK(false);
+ }
- out->buffers.push_back(out_data);
+ RETURN_NOT_OK(ctx->Allocate(buffer_size, &out_data));
+ memset(out_data->mutable_data(), 0, buffer_size);
+
+ out->buffers.push_back(out_data);
+ }
return Status::OK();
}
class CastKernel : public UnaryKernel {
public:
- CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy)
- : options_(options), func_(func), is_zero_copy_(is_zero_copy) {}
+ CastKernel(const CastOptions& options, const CastFunction& func, bool is_zero_copy,
+ bool can_pre_allocate_values)
+ : options_(options),
+ func_(func),
+ is_zero_copy_(is_zero_copy),
+ can_pre_allocate_values_(can_pre_allocate_values) {}
Status Call(FunctionContext* ctx, const Array& input, ArrayData* out) override {
if (!is_zero_copy_) {
- RETURN_NOT_OK(AllocateIfNotPreallocated(ctx, input, out));
+ RETURN_NOT_OK(AllocateIfNotPreallocated(ctx, input, can_pre_allocate_values_, out));
}
func_(ctx, options_, input, out);
RETURN_IF_ERROR(ctx);
@@ -300,11 +542,14 @@ class CastKernel : public UnaryKernel {
CastOptions options_;
CastFunction func_;
bool is_zero_copy_;
+ bool can_pre_allocate_values_;
};
#define CAST_CASE(InType, OutType) \
case OutType::type_id: \
is_zero_copy = is_zero_copy_cast<OutType, InType>::value; \
+ can_pre_allocate_values = \
+ !(!is_binary_like(InType::type_id) && is_binary_like(OutType::type_id)); \
func = [](FunctionContext* ctx, const CastOptions& options, const Array& input, \
ArrayData* out) { \
CastFunctor<OutType, InType> func; \
@@ -354,20 +599,42 @@ class CastKernel : public UnaryKernel {
#define TIMESTAMP_CASES(FN, IN_TYPE) FN(TimestampType, TimestampType);
-#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \
- static std::unique_ptr<UnaryKernel> Get##InType##CastFunc( \
- const std::shared_ptr<DataType>& out_type, const CastOptions& options) { \
- CastFunction func; \
- bool is_zero_copy = false; \
- switch (out_type->id()) { \
- CASE_GENERATOR(CAST_CASE, InType); \
- default: \
- break; \
- } \
- if (func != nullptr) { \
- return std::unique_ptr<UnaryKernel>(new CastKernel(options, func, is_zero_copy)); \
- } \
- return nullptr; \
+#define DICTIONARY_CASES(FN, IN_TYPE) \
+ FN(IN_TYPE, Time32Type); \
+ FN(IN_TYPE, Date32Type); \
+ FN(IN_TYPE, TimestampType); \
+ FN(IN_TYPE, Time64Type); \
+ FN(IN_TYPE, Date64Type); \
+ FN(IN_TYPE, UInt8Type); \
+ FN(IN_TYPE, Int8Type); \
+ FN(IN_TYPE, UInt16Type); \
+ FN(IN_TYPE, Int16Type); \
+ FN(IN_TYPE, UInt32Type); \
+ FN(IN_TYPE, Int32Type); \
+ FN(IN_TYPE, UInt64Type); \
+ FN(IN_TYPE, Int64Type); \
+ FN(IN_TYPE, FloatType); \
+ FN(IN_TYPE, DoubleType); \
+ FN(IN_TYPE, FixedSizeBinaryType); \
+ FN(IN_TYPE, BinaryType); \
+ FN(IN_TYPE, StringType);
+
+#define GET_CAST_FUNCTION(CASE_GENERATOR, InType) \
+ static std::unique_ptr<UnaryKernel> Get##InType##CastFunc( \
+ const std::shared_ptr<DataType>& out_type, const CastOptions& options) { \
+ CastFunction func; \
+ bool is_zero_copy = false; \
+ bool can_pre_allocate_values = true; \
+ switch (out_type->id()) { \
+ CASE_GENERATOR(CAST_CASE, InType); \
+ default: \
+ break; \
+ } \
+ if (func != nullptr) { \
+ return std::unique_ptr<UnaryKernel>( \
+ new CastKernel(options, func, is_zero_copy, can_pre_allocate_values)); \
+ } \
+ return nullptr; \
}
GET_CAST_FUNCTION(NULL_CASES, NullType);
@@ -388,6 +655,8 @@ GET_CAST_FUNCTION(TIME32_CASES, Time32Type);
GET_CAST_FUNCTION(TIME64_CASES, Time64Type);
GET_CAST_FUNCTION(TIMESTAMP_CASES, TimestampType);
+GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType);
+
#define CAST_FUNCTION_CASE(InType) \
case InType::type_id: \
*kernel = Get##InType##CastFunc(out_type, options); \
@@ -413,6 +682,7 @@ Status GetCastFunction(const DataType& in_type, const std::shared_ptr<DataType>&
CAST_FUNCTION_CASE(Time32Type);
CAST_FUNCTION_CASE(Time64Type);
CAST_FUNCTION_CASE(TimestampType);
+ CAST_FUNCTION_CASE(DictionaryType);
default:
break;
}
http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/compute/compute-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index ba645c2..5898aee 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -66,10 +66,9 @@ void AssertArraysEqual(const Array& left, const Array& right) {
class ComputeFixture {
public:
- ComputeFixture() : pool_(default_memory_pool()), ctx_(pool_) {}
+ ComputeFixture() : ctx_(default_memory_pool()) {}
protected:
- MemoryPool* pool_;
FunctionContext ctx_;
};
@@ -81,7 +80,7 @@ static void AssertBufferSame(const Array& left, const Array& right, int buffer_i
right.data()->buffers[buffer_index].get());
}
-class TestCast : public ComputeFixture, public ::testing::Test {
+class TestCast : public ComputeFixture, public TestBase {
public:
void CheckPass(const Array& input, const Array& expected,
const std::shared_ptr<DataType>& out_type, const CastOptions& options) {
@@ -395,5 +394,26 @@ TEST_F(TestCast, PreallocatedMemory) {
AssertArraysEqual(*expected, *result);
}
+template <typename TestType>
+class TestDictionaryCast : public TestCast {};
+
+typedef ::testing::Types<UInt8Type, Int8Type, UInt16Type, Int16Type, Int32Type,
+ UInt32Type, UInt64Type, Int64Type, FloatType, DoubleType,
+ Date32Type, Date64Type, FixedSizeBinaryType, BinaryType>
+ TestTypes;
+
+TYPED_TEST_CASE(TestDictionaryCast, TestTypes);
+
+TYPED_TEST(TestDictionaryCast, Basic) {
+ CastOptions options;
+ std::shared_ptr<Array> plain_array =
+ TestBase::MakeRandomArray<typename TypeTraits<TypeParam>::ArrayType>(10, 2);
+
+ std::shared_ptr<Array> dict_array;
+ ASSERT_OK(EncodeArrayToDictionary(*plain_array, this->pool_, &dict_array));
+
+ this->CheckPass(*dict_array, *plain_array, plain_array->type(), options);
+}
+
} // namespace compute
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/table-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/table-test.cc b/cpp/src/arrow/table-test.cc
index a9c7e6d..b0aeed1 100644
--- a/cpp/src/arrow/table-test.cc
+++ b/cpp/src/arrow/table-test.cc
@@ -128,9 +128,9 @@ class TestColumn : public TestChunkedArray {
TEST_F(TestColumn, BasicAPI) {
ArrayVector arrays;
- arrays.push_back(MakePrimitive<Int32Array>(100));
- arrays.push_back(MakePrimitive<Int32Array>(100, 10));
- arrays.push_back(MakePrimitive<Int32Array>(100, 20));
+ arrays.push_back(MakeRandomArray<Int32Array>(100));
+ arrays.push_back(MakeRandomArray<Int32Array>(100, 10));
+ arrays.push_back(MakeRandomArray<Int32Array>(100, 20));
auto f0 = field("c0", int32());
column_.reset(new Column(f0, arrays));
@@ -148,15 +148,15 @@ TEST_F(TestColumn, BasicAPI) {
TEST_F(TestColumn, ChunksInhomogeneous) {
ArrayVector arrays;
- arrays.push_back(MakePrimitive<Int32Array>(100));
- arrays.push_back(MakePrimitive<Int32Array>(100, 10));
+ arrays.push_back(MakeRandomArray<Int32Array>(100));
+ arrays.push_back(MakeRandomArray<Int32Array>(100, 10));
auto f0 = field("c0", int32());
column_.reset(new Column(f0, arrays));
ASSERT_OK(column_->ValidateData());
- arrays.push_back(MakePrimitive<Int16Array>(100, 10));
+ arrays.push_back(MakeRandomArray<Int16Array>(100, 10));
column_.reset(new Column(f0, arrays));
ASSERT_RAISES(Invalid, column_->ValidateData());
}
@@ -202,8 +202,8 @@ class TestTable : public TestBase {
vector<shared_ptr<Field>> fields = {f0, f1, f2};
schema_ = std::make_shared<Schema>(fields);
- arrays_ = {MakePrimitive<Int32Array>(length), MakePrimitive<UInt8Array>(length),
- MakePrimitive<Int16Array>(length)};
+ arrays_ = {MakeRandomArray<Int32Array>(length), MakeRandomArray<UInt8Array>(length),
+ MakeRandomArray<Int16Array>(length)};
columns_ = {std::make_shared<Column>(schema_->field(0), arrays_[0]),
std::make_shared<Column>(schema_->field(1), arrays_[1]),
@@ -276,9 +276,10 @@ TEST_F(TestTable, InvalidColumns) {
ASSERT_RAISES(Invalid, table_->ValidateColumns());
columns_ = {
- std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length)),
- std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length)),
- std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length - 1))};
+ std::make_shared<Column>(schema_->field(0), MakeRandomArray<Int32Array>(length)),
+ std::make_shared<Column>(schema_->field(1), MakeRandomArray<UInt8Array>(length)),
+ std::make_shared<Column>(schema_->field(2),
+ MakeRandomArray<Int16Array>(length - 1))};
table_.reset(new Table(schema_, columns_, length));
ASSERT_RAISES(Invalid, table_->ValidateColumns());
@@ -300,9 +301,12 @@ TEST_F(TestTable, Equals) {
ASSERT_FALSE(table_->Equals(Table(other_schema, columns_)));
// Differing columns
std::vector<std::shared_ptr<Column>> other_columns = {
- std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length, 10)),
- std::make_shared<Column>(schema_->field(1), MakePrimitive<UInt8Array>(length, 10)),
- std::make_shared<Column>(schema_->field(2), MakePrimitive<Int16Array>(length, 10))};
+ std::make_shared<Column>(schema_->field(0),
+ MakeRandomArray<Int32Array>(length, 10)),
+ std::make_shared<Column>(schema_->field(1),
+ MakeRandomArray<UInt8Array>(length, 10)),
+ std::make_shared<Column>(schema_->field(2),
+ MakeRandomArray<Int16Array>(length, 10))};
ASSERT_FALSE(table_->Equals(Table(schema_, other_columns)));
}
@@ -410,8 +414,8 @@ TEST_F(TestTable, AddColumn) {
ASSERT_TRUE(status.IsInvalid());
// Add column with wrong length
- auto longer_col =
- std::make_shared<Column>(schema_->field(0), MakePrimitive<Int32Array>(length + 1));
+ auto longer_col = std::make_shared<Column>(schema_->field(0),
+ MakeRandomArray<Int32Array>(length + 1));
status = table.AddColumn(0, longer_col, &result);
ASSERT_TRUE(status.IsInvalid());
@@ -445,8 +449,8 @@ TEST_F(TestTable, AddColumn) {
TEST_F(TestTable, IsChunked) {
ArrayVector c1, c2;
- auto a1 = MakePrimitive<Int32Array>(10);
- auto a2 = MakePrimitive<Int32Array>(20);
+ auto a1 = MakeRandomArray<Int32Array>(10);
+ auto a2 = MakeRandomArray<Int32Array>(20);
auto sch1 = arrow::schema({field("f1", int32()), field("f2", int32())});
@@ -477,9 +481,9 @@ TEST_F(TestRecordBatch, Equals) {
vector<shared_ptr<Field>> fields = {f0, f1, f2};
auto schema = std::make_shared<Schema>(fields);
- auto a0 = MakePrimitive<Int32Array>(length);
- auto a1 = MakePrimitive<UInt8Array>(length);
- auto a2 = MakePrimitive<Int16Array>(length);
+ auto a0 = MakeRandomArray<Int32Array>(length);
+ auto a1 = MakeRandomArray<UInt8Array>(length);
+ auto a2 = MakeRandomArray<Int16Array>(length);
RecordBatch b1(schema, length, {a0, a1, a2});
RecordBatch b3(schema, length, {a0, a1});
@@ -502,10 +506,10 @@ TEST_F(TestRecordBatch, Validate) {
auto schema = ::arrow::schema({f0, f1, f2});
- auto a0 = MakePrimitive<Int32Array>(length);
- auto a1 = MakePrimitive<UInt8Array>(length);
- auto a2 = MakePrimitive<Int16Array>(length);
- auto a3 = MakePrimitive<Int16Array>(5);
+ auto a0 = MakeRandomArray<Int32Array>(length);
+ auto a1 = MakeRandomArray<UInt8Array>(length);
+ auto a2 = MakeRandomArray<Int16Array>(length);
+ auto a3 = MakeRandomArray<Int16Array>(5);
RecordBatch b1(schema, length, {a0, a1, a2});
@@ -531,8 +535,8 @@ TEST_F(TestRecordBatch, Slice) {
vector<shared_ptr<Field>> fields = {f0, f1};
auto schema = std::make_shared<Schema>(fields);
- auto a0 = MakePrimitive<Int32Array>(length);
- auto a1 = MakePrimitive<UInt8Array>(length);
+ auto a0 = MakeRandomArray<Int32Array>(length);
+ auto a1 = MakeRandomArray<UInt8Array>(length);
RecordBatch batch(schema, length, {a0, a1});
@@ -555,10 +559,10 @@ class TestTableBatchReader : public TestBase {};
TEST_F(TestTableBatchReader, ReadNext) {
ArrayVector c1, c2;
- auto a1 = MakePrimitive<Int32Array>(10);
- auto a2 = MakePrimitive<Int32Array>(20);
- auto a3 = MakePrimitive<Int32Array>(30);
- auto a4 = MakePrimitive<Int32Array>(10);
+ auto a1 = MakeRandomArray<Int32Array>(10);
+ auto a2 = MakeRandomArray<Int32Array>(20);
+ auto a3 = MakeRandomArray<Int32Array>(30);
+ auto a4 = MakeRandomArray<Int32Array>(10);
auto sch1 = arrow::schema({field("f1", int32()), field("f2", int32())});
http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/test-common.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/test-common.h b/cpp/src/arrow/test-common.h
index 4ce0640..3dc39fc 100644
--- a/cpp/src/arrow/test-common.h
+++ b/cpp/src/arrow/test-common.h
@@ -42,15 +42,7 @@ class TestBase : public ::testing::Test {
random_seed_ = 0;
}
- template <typename ArrayType>
- std::shared_ptr<Array> MakePrimitive(int64_t length, int64_t null_count = 0) {
- auto data = std::make_shared<PoolBuffer>(pool_);
- const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type);
- EXPECT_OK(data->Resize(data_nbytes));
-
- // Fill with random data
- test::random_bytes(data_nbytes, random_seed_++, data->mutable_data());
-
+ std::shared_ptr<Buffer> MakeRandomNullBitmap(int64_t length, int64_t null_count) {
auto null_bitmap = std::make_shared<PoolBuffer>(pool_);
const int64_t null_nbytes = BitUtil::BytesForBits(length);
EXPECT_OK(null_bitmap->Resize(null_nbytes));
@@ -58,14 +50,68 @@ class TestBase : public ::testing::Test {
for (int64_t i = 0; i < null_count; i++) {
BitUtil::ClearBit(null_bitmap->mutable_data(), i * (length / null_count));
}
- return std::make_shared<ArrayType>(length, data, null_bitmap, null_count);
+ return null_bitmap;
}
+ template <typename ArrayType>
+ std::shared_ptr<Array> MakeRandomArray(int64_t length, int64_t null_count = 0);
+
protected:
uint32_t random_seed_;
MemoryPool* pool_;
};
+template <typename ArrayType>
+std::shared_ptr<Array> TestBase::MakeRandomArray(int64_t length, int64_t null_count) {
+ auto data = std::make_shared<PoolBuffer>(pool_);
+ const int64_t data_nbytes = length * sizeof(typename ArrayType::value_type);
+ EXPECT_OK(data->Resize(data_nbytes));
+
+ // Fill with random data
+ test::random_bytes(data_nbytes, random_seed_++, data->mutable_data());
+ std::shared_ptr<Buffer> null_bitmap = MakeRandomNullBitmap(length, null_count);
+
+ return std::make_shared<ArrayType>(length, data, null_bitmap, null_count);
+}
+
+template <>
+std::shared_ptr<Array> TestBase::MakeRandomArray<FixedSizeBinaryArray>(
+ int64_t length, int64_t null_count) {
+ const int byte_width = 10;
+ std::shared_ptr<Buffer> null_bitmap = MakeRandomNullBitmap(length, null_count);
+ std::shared_ptr<PoolBuffer> data = std::make_shared<PoolBuffer>(pool_);
+
+ EXPECT_OK(data->Resize(byte_width * length));
+ ::arrow::test::random_bytes(data->size(), 0, data->mutable_data());
+ return std::make_shared<FixedSizeBinaryArray>(fixed_size_binary(byte_width), length,
+ data, null_bitmap, null_count);
+}
+
+template <>
+std::shared_ptr<Array> TestBase::MakeRandomArray<BinaryArray>(int64_t length,
+ int64_t null_count) {
+ std::vector<uint8_t> valid_bytes(length, 1);
+ for (int64_t i = 0; i < null_count; i++) {
+ valid_bytes[i * 2] = 0;
+ }
+ BinaryBuilder builder(pool_);
+
+ const int kBufferSize = 10;
+ uint8_t buffer[kBufferSize];
+ for (int64_t i = 0; i < length; i++) {
+ if (!valid_bytes[i]) {
+ EXPECT_OK(builder.AppendNull());
+ } else {
+ ::arrow::test::random_bytes(kBufferSize, static_cast<uint32_t>(i), buffer);
+ EXPECT_OK(builder.Append(buffer, kBufferSize));
+ }
+ }
+
+ std::shared_ptr<Array> out;
+ EXPECT_OK(builder.Finish(&out));
+ return out;
+}
+
class TestBuilder : public ::testing::Test {
public:
void SetUp() {
http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/type.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 283e27e..7630f48 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -169,6 +169,11 @@ class ARROW_EXPORT DataType {
ARROW_DISALLOW_COPY_AND_ASSIGN(DataType);
};
+inline std::ostream& operator<<(std::ostream& os, const DataType& type) {
+ os << type.ToString();
+ return os;
+}
+
// TODO(wesm): Remove this from parquet-cpp
using TypePtr = std::shared_ptr<DataType>;
http://git-wip-us.apache.org/repos/asf/arrow/blob/7e957642/cpp/src/arrow/util/bit-util.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h
index aa83746..b8a8efa 100644
--- a/cpp/src/arrow/util/bit-util.h
+++ b/cpp/src/arrow/util/bit-util.h
@@ -49,9 +49,9 @@
namespace arrow {
-#define INIT_BITSET(valid_bits_vector, valid_bits_index) \
- int byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
- int bit_offset_##valid_bits_vector = (valid_bits_index) % 8; \
+#define INIT_BITSET(valid_bits_vector, valid_bits_index) \
+ int64_t byte_offset_##valid_bits_vector = (valid_bits_index) / 8; \
+ int64_t bit_offset_##valid_bits_vector = (valid_bits_index) % 8; \
uint8_t bitset_##valid_bits_vector = valid_bits_vector[byte_offset_##valid_bits_vector];
#define READ_NEXT_BITSET(valid_bits_vector) \