You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2017/02/06 16:25:36 UTC
[3/3] arrow git commit: ARROW-33: [C++] Implement zero-copy array
slicing, integrate with IPC code paths
ARROW-33: [C++] Implement zero-copy array slicing, integrate with IPC code paths
This turned into a bit of a refactoring bloodbath. I have sorted through most of the issues that this turned up, so I should have this all completely working within a day or so. There will be some follow-up work to do to polish things up.
Closes #56.
Author: Wes McKinney <we...@twosigma.com>
Closes #322 from wesm/ARROW-33 and squashes the following commits:
61afe42 [Wes McKinney] Some API cleaning in builder.h
86511a3 [Wes McKinney] Python fixes, clang warning fixes
9a00870 [Wes McKinney] Make ApproxEquals for floating point arrays work on slices
2a13929 [Wes McKinney] Implement slicing IPC logic for dense array
4f08628 [Wes McKinney] Add missing include
1a6fcb4 [Wes McKinney] Make some more progress. dense union needs more work
c6d814d [Wes McKinney] Work on adding sliced array support to IPC code path, with pretty printer and comparison fixed for sliced bitmaps, etc. Not all working yet
b6c511e [Wes McKinney] Add RecordBatch::Slice convenience method
8900d58 [Wes McKinney] Add Slice tests for DictionaryArray. Test recomputing the null count
55454d7 [Wes McKinney] Add slice tests for struct, union, string, list
a72653d [Wes McKinney] Rename offsets to value_offsets in list/binary/string/union for better clarity. Test Slice for primitive arrays
0355f71 [Wes McKinney] Implement CopyBitmap function
a228b50 [Wes McKinney] Implement Slice methods on Array classes
e502901 [Wes McKinney] Move null_count and offset as last two parameters of all array ctors. Implement/test bitmap set bit count with offset
bae6922 [Wes McKinney] Temporary work on adding offset parameter to Array classes for slicing
Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/5439b715
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/5439b715
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/5439b715
Branch: refs/heads/master
Commit: 5439b71586f4b0f9a36544b9e2417ee6ad7b48e8
Parents: 74bc4dd
Author: Wes McKinney <we...@twosigma.com>
Authored: Mon Feb 6 11:25:18 2017 -0500
Committer: Wes McKinney <we...@twosigma.com>
Committed: Mon Feb 6 11:25:18 2017 -0500
----------------------------------------------------------------------
cpp/src/arrow/CMakeLists.txt | 1 +
cpp/src/arrow/array-dictionary-test.cc | 62 ++++--
cpp/src/arrow/array-list-test.cc | 36 +++-
cpp/src/arrow/array-primitive-test.cc | 78 ++++++-
cpp/src/arrow/array-string-test.cc | 90 ++++++--
cpp/src/arrow/array-struct-test.cc | 19 +-
cpp/src/arrow/array-test.cc | 32 ++-
cpp/src/arrow/array-union-test.cc | 67 ++++++
cpp/src/arrow/array.cc | 233 ++++++++++++++-------
cpp/src/arrow/array.h | 265 +++++++++++++++---------
cpp/src/arrow/buffer.cc | 16 ++
cpp/src/arrow/buffer.h | 21 +-
cpp/src/arrow/builder.cc | 64 +++---
cpp/src/arrow/builder.h | 21 +-
cpp/src/arrow/column-test.cc | 14 +-
cpp/src/arrow/compare.cc | 122 ++++++++---
cpp/src/arrow/io/file.cc | 4 +-
cpp/src/arrow/io/hdfs.cc | 8 +-
cpp/src/arrow/io/io-hdfs-test.cc | 10 +-
cpp/src/arrow/io/io-memory-test.cc | 4 +-
cpp/src/arrow/ipc/adapter.cc | 260 +++++++++++++++++++----
cpp/src/arrow/ipc/adapter.h | 8 +-
cpp/src/arrow/ipc/ipc-adapter-test.cc | 52 ++++-
cpp/src/arrow/ipc/ipc-json-test.cc | 21 +-
cpp/src/arrow/ipc/json-integration-test.cc | 6 +-
cpp/src/arrow/ipc/json-internal.cc | 37 ++--
cpp/src/arrow/ipc/stream.cc | 15 +-
cpp/src/arrow/ipc/stream.h | 8 +
cpp/src/arrow/ipc/test-common.h | 79 ++++---
cpp/src/arrow/pretty_print-test.cc | 6 +-
cpp/src/arrow/pretty_print.cc | 53 +++--
cpp/src/arrow/table-test.cc | 26 +++
cpp/src/arrow/table.cc | 19 +-
cpp/src/arrow/table.h | 4 +
cpp/src/arrow/test-util.h | 43 +---
cpp/src/arrow/type.cc | 6 +-
cpp/src/arrow/type.h | 8 +-
cpp/src/arrow/type_traits.h | 9 +
cpp/src/arrow/util/bit-util-test.cc | 62 +++++-
cpp/src/arrow/util/bit-util.cc | 83 +++++++-
cpp/src/arrow/util/bit-util.h | 45 ++++
cpp/src/arrow/util/logging.h | 4 +-
cpp/src/arrow/util/macros.h | 2 +-
python/CMakeLists.txt | 2 +-
python/pyarrow/includes/libarrow.pxd | 4 +-
python/pyarrow/scalar.pyx | 2 +-
python/src/pyarrow/adapters/builtin.cc | 2 +-
python/src/pyarrow/adapters/pandas.cc | 20 +-
python/src/pyarrow/io.cc | 21 +-
49 files changed, 1524 insertions(+), 550 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/CMakeLists.txt
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index b002bb7..824ced1 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -53,6 +53,7 @@ ADD_ARROW_TEST(array-list-test)
ADD_ARROW_TEST(array-primitive-test)
ADD_ARROW_TEST(array-string-test)
ADD_ARROW_TEST(array-struct-test)
+ADD_ARROW_TEST(array-union-test)
ADD_ARROW_TEST(buffer-test)
ADD_ARROW_TEST(column-test)
ADD_ARROW_TEST(memory_pool-test)
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-dictionary-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-dictionary-test.cc b/cpp/src/arrow/array-dictionary-test.cc
index 1a0d49a..61381b7 100644
--- a/cpp/src/arrow/array-dictionary-test.cc
+++ b/cpp/src/arrow/array-dictionary-test.cc
@@ -34,7 +34,7 @@ namespace arrow {
TEST(TestDictionary, Basics) {
std::vector<int32_t> values = {100, 1000, 10000, 100000};
std::shared_ptr<Array> dict;
- ArrayFromVector<Int32Type, int32_t>(int32(), values, &dict);
+ ArrayFromVector<Int32Type, int32_t>(values, &dict);
std::shared_ptr<DictionaryType> type1 =
std::dynamic_pointer_cast<DictionaryType>(dictionary(int16(), dict));
@@ -54,45 +54,67 @@ TEST(TestDictionary, Equals) {
std::shared_ptr<Array> dict;
std::vector<std::string> dict_values = {"foo", "bar", "baz"};
- ArrayFromVector<StringType, std::string>(utf8(), dict_values, &dict);
+ ArrayFromVector<StringType, std::string>(dict_values, &dict);
std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
std::shared_ptr<Array> dict2;
std::vector<std::string> dict2_values = {"foo", "bar", "baz", "qux"};
- ArrayFromVector<StringType, std::string>(utf8(), dict2_values, &dict2);
+ ArrayFromVector<StringType, std::string>(dict2_values, &dict2);
std::shared_ptr<DataType> dict2_type = dictionary(int16(), dict2);
std::shared_ptr<Array> indices;
std::vector<int16_t> indices_values = {1, 2, -1, 0, 2, 0};
- ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices_values, &indices);
+ ArrayFromVector<Int16Type, int16_t>(is_valid, indices_values, &indices);
std::shared_ptr<Array> indices2;
std::vector<int16_t> indices2_values = {1, 2, 0, 0, 2, 0};
- ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices2_values, &indices2);
+ ArrayFromVector<Int16Type, int16_t>(is_valid, indices2_values, &indices2);
std::shared_ptr<Array> indices3;
std::vector<int16_t> indices3_values = {1, 1, 0, 0, 2, 0};
- ArrayFromVector<Int16Type, int16_t>(int16(), is_valid, indices3_values, &indices3);
+ ArrayFromVector<Int16Type, int16_t>(is_valid, indices3_values, &indices3);
- auto arr = std::make_shared<DictionaryArray>(dict_type, indices);
- auto arr2 = std::make_shared<DictionaryArray>(dict_type, indices2);
- auto arr3 = std::make_shared<DictionaryArray>(dict2_type, indices);
- auto arr4 = std::make_shared<DictionaryArray>(dict_type, indices3);
+ auto array = std::make_shared<DictionaryArray>(dict_type, indices);
+ auto array2 = std::make_shared<DictionaryArray>(dict_type, indices2);
+ auto array3 = std::make_shared<DictionaryArray>(dict2_type, indices);
+ auto array4 = std::make_shared<DictionaryArray>(dict_type, indices3);
- ASSERT_TRUE(arr->Equals(arr));
+ ASSERT_TRUE(array->Equals(array));
// Equal, because the unequal index is masked by null
- ASSERT_TRUE(arr->Equals(arr2));
+ ASSERT_TRUE(array->Equals(array2));
// Unequal dictionaries
- ASSERT_FALSE(arr->Equals(arr3));
+ ASSERT_FALSE(array->Equals(array3));
// Unequal indices
- ASSERT_FALSE(arr->Equals(arr4));
+ ASSERT_FALSE(array->Equals(array4));
// RangeEquals
- ASSERT_TRUE(arr->RangeEquals(3, 6, 3, arr4));
- ASSERT_FALSE(arr->RangeEquals(1, 3, 1, arr4));
+ ASSERT_TRUE(array->RangeEquals(3, 6, 3, array4));
+ ASSERT_FALSE(array->RangeEquals(1, 3, 1, array4));
+
+ // ARROW-33 Test slices
+ const int size = array->length();
+
+ std::shared_ptr<Array> slice, slice2;
+ slice = array->Array::Slice(2);
+ slice2 = array->Array::Slice(2);
+ ASSERT_EQ(size - 2, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice));
+
+ // Chained slices
+ slice2 = array->Array::Slice(1)->Array::Slice(1);
+ ASSERT_TRUE(slice->Equals(slice2));
+
+ slice = array->Slice(1, 3);
+ slice2 = array->Slice(1, 3);
+ ASSERT_EQ(3, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(1, 4, 0, slice));
}
TEST(TestDictionary, Validate) {
@@ -100,20 +122,20 @@ TEST(TestDictionary, Validate) {
std::shared_ptr<Array> dict;
std::vector<std::string> dict_values = {"foo", "bar", "baz"};
- ArrayFromVector<StringType, std::string>(utf8(), dict_values, &dict);
+ ArrayFromVector<StringType, std::string>(dict_values, &dict);
std::shared_ptr<DataType> dict_type = dictionary(int16(), dict);
std::shared_ptr<Array> indices;
std::vector<uint8_t> indices_values = {1, 2, 0, 0, 2, 0};
- ArrayFromVector<UInt8Type, uint8_t>(uint8(), is_valid, indices_values, &indices);
+ ArrayFromVector<UInt8Type, uint8_t>(is_valid, indices_values, &indices);
std::shared_ptr<Array> indices2;
std::vector<float> indices2_values = {1., 2., 0., 0., 2., 0.};
- ArrayFromVector<FloatType, float>(float32(), is_valid, indices2_values, &indices2);
+ ArrayFromVector<FloatType, float>(is_valid, indices2_values, &indices2);
std::shared_ptr<Array> indices3;
std::vector<int64_t> indices3_values = {1, 2, 0, 0, 2, 0};
- ArrayFromVector<Int64Type, int64_t>(int64(), is_valid, indices3_values, &indices3);
+ ArrayFromVector<Int64Type, int64_t>(is_valid, indices3_values, &indices3);
std::shared_ptr<Array> arr = std::make_shared<DictionaryArray>(dict_type, indices);
std::shared_ptr<Array> arr2 = std::make_shared<DictionaryArray>(dict_type, indices2);
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-list-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc
index 8e4d319..a144fd9 100644
--- a/cpp/src/arrow/array-list-test.cc
+++ b/cpp/src/arrow/array-list-test.cc
@@ -90,9 +90,9 @@ TEST_F(TestListBuilder, Equality) {
Int32Builder* vb = static_cast<Int32Builder*>(builder_->value_builder().get());
std::shared_ptr<Array> array, equal_array, unequal_array;
- vector<int32_t> equal_offsets = {0, 1, 2, 5};
- vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2};
- vector<int32_t> unequal_offsets = {0, 1, 4};
+ vector<int32_t> equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10};
+ vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6};
+ vector<int32_t> unequal_offsets = {0, 1, 4, 7};
vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5};
// setup two equal arrays
@@ -122,7 +122,27 @@ TEST_F(TestListBuilder, Equality) {
EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array));
EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array));
- EXPECT_TRUE(array->RangeEquals(3, 4, 1, unequal_array));
+
+ // Check with slices, ARROW-33
+ std::shared_ptr<Array> slice, slice2;
+
+ slice = array->Slice(2);
+ slice2 = array->Slice(2);
+ ASSERT_EQ(array->length() - 2, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice));
+
+ // Chained slices
+ slice2 = array->Slice(1)->Slice(1);
+ ASSERT_TRUE(slice->Equals(slice2));
+
+ slice = array->Slice(1, 4);
+ slice2 = array->Slice(1, 4);
+ ASSERT_EQ(4, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice));
}
TEST_F(TestListBuilder, TestResize) {}
@@ -137,9 +157,9 @@ TEST_F(TestListBuilder, TestAppendNull) {
ASSERT_TRUE(result_->IsNull(0));
ASSERT_TRUE(result_->IsNull(1));
- ASSERT_EQ(0, result_->raw_offsets()[0]);
- ASSERT_EQ(0, result_->offset(1));
- ASSERT_EQ(0, result_->offset(2));
+ ASSERT_EQ(0, result_->raw_value_offsets()[0]);
+ ASSERT_EQ(0, result_->value_offset(1));
+ ASSERT_EQ(0, result_->value_offset(2));
Int32Array* values = static_cast<Int32Array*>(result_->values().get());
ASSERT_EQ(0, values->length());
@@ -154,7 +174,7 @@ void ValidateBasicListArray(const ListArray* result, const vector<int32_t>& valu
ASSERT_EQ(3, result->length());
vector<int32_t> ex_offsets = {0, 3, 3, 7};
for (size_t i = 0; i < ex_offsets.size(); ++i) {
- ASSERT_EQ(ex_offsets[i], result->offset(i));
+ ASSERT_EQ(ex_offsets[i], result->value_offset(i));
}
for (int i = 0; i < result->length(); ++i) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-primitive-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-primitive-test.cc b/cpp/src/arrow/array-primitive-test.cc
index c839fb9..a20fdbf 100644
--- a/cpp/src/arrow/array-primitive-test.cc
+++ b/cpp/src/arrow/array-primitive-test.cc
@@ -121,7 +121,7 @@ class TestPrimitiveBuilder : public TestBuilder {
}
auto expected =
- std::make_shared<ArrayType>(size, ex_data, ex_null_count, ex_null_bitmap);
+ std::make_shared<ArrayType>(size, ex_data, ex_null_bitmap, ex_null_count);
std::shared_ptr<Array> out;
ASSERT_OK(builder->Finish(&out));
@@ -217,7 +217,7 @@ void TestPrimitiveBuilder<PBoolean>::Check(
}
auto expected =
- std::make_shared<BooleanArray>(size, ex_data, ex_null_count, ex_null_bitmap);
+ std::make_shared<BooleanArray>(size, ex_data, ex_null_bitmap, ex_null_count);
std::shared_ptr<Array> out;
ASSERT_OK(builder->Finish(&out));
@@ -235,15 +235,14 @@ void TestPrimitiveBuilder<PBoolean>::Check(
for (int i = 0; i < result->length(); ++i) {
if (nullable) { ASSERT_EQ(valid_bytes_[i] == 0, result->IsNull(i)) << i; }
- bool actual = BitUtil::GetBit(result->raw_data(), i);
+ bool actual = BitUtil::GetBit(result->data()->data(), i);
ASSERT_EQ(static_cast<bool>(draws_[i]), actual) << i;
}
ASSERT_TRUE(result->Equals(*expected));
}
typedef ::testing::Types<PBoolean, PUInt8, PUInt16, PUInt32, PUInt64, PInt8, PInt16,
- PInt32, PInt64, PFloat, PDouble>
- Primitives;
+ PInt32, PInt64, PFloat, PDouble> Primitives;
TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives);
@@ -347,6 +346,39 @@ TYPED_TEST(TestPrimitiveBuilder, Equality) {
array->RangeEquals(first_valid_idx + 1, size, first_valid_idx + 1, unequal_array));
}
+TYPED_TEST(TestPrimitiveBuilder, SliceEquality) {
+ DECL_T();
+
+ const int size = 1000;
+ this->RandomData(size);
+ vector<T>& draws = this->draws_;
+ vector<uint8_t>& valid_bytes = this->valid_bytes_;
+ auto builder = this->builder_.get();
+
+ std::shared_ptr<Array> array;
+ ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &array));
+
+ std::shared_ptr<Array> slice, slice2;
+
+ slice = array->Slice(5);
+ slice2 = array->Slice(5);
+ ASSERT_EQ(size - 5, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(5, array->length(), 0, slice));
+
+ // Chained slices
+ slice2 = array->Slice(2)->Slice(3);
+ ASSERT_TRUE(slice->Equals(slice2));
+
+ slice = array->Slice(5, 10);
+ slice2 = array->Slice(5, 10);
+ ASSERT_EQ(10, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(5, 15, 0, slice));
+}
+
TYPED_TEST(TestPrimitiveBuilder, TestAppendScalar) {
DECL_T();
@@ -473,4 +505,40 @@ TYPED_TEST(TestPrimitiveBuilder, TestReserve) {
ASSERT_EQ(BitUtil::NextPower2(kMinBuilderCapacity + 100), this->builder_->capacity());
}
+template <typename TYPE>
+void CheckSliceApproxEquals() {
+ using T = typename TYPE::c_type;
+
+ const int kSize = 50;
+ std::vector<T> draws1;
+ std::vector<T> draws2;
+
+ const uint32_t kSeed = 0;
+ test::random_real<T>(kSize, kSeed, 0, 100, &draws1);
+ test::random_real<T>(kSize, kSeed + 1, 0, 100, &draws2);
+
+ // Make the draws equal in the sliced segment, but unequal elsewhere (to
+ // catch not using the slice offset)
+ for (int i = 10; i < 30; ++i) {
+ draws2[i] = draws1[i];
+ }
+
+ std::vector<bool> is_valid;
+ test::random_is_valid(kSize, 0.1, &is_valid);
+
+ std::shared_ptr<Array> array1, array2;
+ ArrayFromVector<TYPE, T>(is_valid, draws1, &array1);
+ ArrayFromVector<TYPE, T>(is_valid, draws2, &array2);
+
+ std::shared_ptr<Array> slice1 = array1->Slice(10, 20);
+ std::shared_ptr<Array> slice2 = array2->Slice(10, 20);
+
+ ASSERT_TRUE(slice1->ApproxEquals(slice2));
+}
+
+TEST(TestPrimitiveAdHoc, FloatingSliceApproxEquals) {
+ CheckSliceApproxEquals<FloatType>();
+ CheckSliceApproxEquals<DoubleType>();
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-string-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-string-test.cc b/cpp/src/arrow/array-string-test.cc
index 5ea384a..8b7eb41 100644
--- a/cpp/src/arrow/array-string-test.cc
+++ b/cpp/src/arrow/array-string-test.cc
@@ -27,6 +27,7 @@
#include "arrow/builder.h"
#include "arrow/test-util.h"
#include "arrow/type.h"
+#include "arrow/type_traits.h"
namespace arrow {
@@ -70,7 +71,7 @@ class TestStringArray : public ::testing::Test {
null_count_ = test::null_count(valid_bytes_);
strings_ = std::make_shared<StringArray>(
- length_, offsets_buf_, value_buf_, null_count_, null_bitmap_);
+ length_, offsets_buf_, value_buf_, null_bitmap_, null_count_);
}
protected:
@@ -114,7 +115,7 @@ TEST_F(TestStringArray, TestListFunctions) {
TEST_F(TestStringArray, TestDestructor) {
auto arr = std::make_shared<StringArray>(
- length_, offsets_buf_, value_buf_, null_count_, null_bitmap_);
+ length_, offsets_buf_, value_buf_, null_bitmap_, null_count_);
}
TEST_F(TestStringArray, TestGetString) {
@@ -133,9 +134,9 @@ TEST_F(TestStringArray, TestEmptyStringComparison) {
length_ = offsets_.size() - 1;
auto strings_a = std::make_shared<StringArray>(
- length_, offsets_buf_, nullptr, null_count_, null_bitmap_);
+ length_, offsets_buf_, nullptr, null_bitmap_, null_count_);
auto strings_b = std::make_shared<StringArray>(
- length_, offsets_buf_, nullptr, null_count_, null_bitmap_);
+ length_, offsets_buf_, nullptr, null_bitmap_, null_count_);
ASSERT_TRUE(strings_a->Equals(strings_b));
}
@@ -146,8 +147,7 @@ class TestStringBuilder : public TestBuilder {
public:
void SetUp() {
TestBuilder::SetUp();
- type_ = TypePtr(new StringType());
- builder_.reset(new StringBuilder(pool_, type_));
+ builder_.reset(new StringBuilder(pool_));
}
void Done() {
@@ -159,8 +159,6 @@ class TestStringBuilder : public TestBuilder {
}
protected:
- TypePtr type_;
-
std::unique_ptr<StringBuilder> builder_;
std::shared_ptr<StringArray> result_;
};
@@ -195,7 +193,7 @@ TEST_F(TestStringBuilder, TestScalarAppend) {
} else {
ASSERT_FALSE(result_->IsNull(i));
result_->GetValue(i, &length);
- ASSERT_EQ(pos, result_->offset(i));
+ ASSERT_EQ(pos, result_->value_offset(i));
ASSERT_EQ(static_cast<int>(strings[i % N].size()), length);
ASSERT_EQ(strings[i % N], result_->GetString(i));
@@ -232,7 +230,7 @@ class TestBinaryArray : public ::testing::Test {
null_count_ = test::null_count(valid_bytes_);
strings_ = std::make_shared<BinaryArray>(
- length_, offsets_buf_, value_buf_, null_count_, null_bitmap_);
+ length_, offsets_buf_, value_buf_, null_bitmap_, null_count_);
}
protected:
@@ -276,7 +274,7 @@ TEST_F(TestBinaryArray, TestListFunctions) {
TEST_F(TestBinaryArray, TestDestructor) {
auto arr = std::make_shared<BinaryArray>(
- length_, offsets_buf_, value_buf_, null_count_, null_bitmap_);
+ length_, offsets_buf_, value_buf_, null_bitmap_, null_count_);
}
TEST_F(TestBinaryArray, TestGetValue) {
@@ -306,8 +304,8 @@ TEST_F(TestBinaryArray, TestEqualsEmptyStrings) {
ASSERT_OK(builder.Finish(&left_arr));
const BinaryArray& left = static_cast<const BinaryArray&>(*left_arr);
- std::shared_ptr<Array> right = std::make_shared<BinaryArray>(
- left.length(), left.offsets(), nullptr, left.null_count(), left.null_bitmap());
+ std::shared_ptr<Array> right = std::make_shared<BinaryArray>(left.length(),
+ left.value_offsets(), nullptr, left.null_bitmap(), left.null_count());
ASSERT_TRUE(left.Equals(right));
ASSERT_TRUE(left.RangeEquals(0, left.length(), 0, right));
@@ -317,8 +315,7 @@ class TestBinaryBuilder : public TestBuilder {
public:
void SetUp() {
TestBuilder::SetUp();
- type_ = TypePtr(new BinaryType());
- builder_.reset(new BinaryBuilder(pool_, type_));
+ builder_.reset(new BinaryBuilder(pool_));
}
void Done() {
@@ -330,8 +327,6 @@ class TestBinaryBuilder : public TestBuilder {
}
protected:
- TypePtr type_;
-
std::unique_ptr<BinaryBuilder> builder_;
std::shared_ptr<BinaryArray> result_;
};
@@ -348,8 +343,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
if (is_null[i]) {
builder_->AppendNull();
} else {
- builder_->Append(
- reinterpret_cast<const uint8_t*>(strings[i].data()), strings[i].size());
+ builder_->Append(strings[i]);
}
}
}
@@ -377,4 +371,62 @@ TEST_F(TestBinaryBuilder, TestZeroLength) {
Done();
}
+// ----------------------------------------------------------------------
+// Slice tests
+// CheckSliceEquality: builds a 60-element array (with nulls) through TYPE's builder and checks slice equality
+template <typename TYPE>
+void CheckSliceEquality() {
+ using Traits = TypeTraits<TYPE>;
+ using BuilderType = typename Traits::BuilderType;
+
+ BuilderType builder(default_memory_pool());
+
+ std::vector<std::string> strings = {"foo", "", "bar", "baz", "qux", ""};
+ std::vector<uint8_t> is_null = {0, 1, 0, 1, 0, 0};
+
+ int N = strings.size();
+ int reps = 10;
+
+ for (int j = 0; j < reps; ++j) {
+ for (int i = 0; i < N; ++i) {
+ if (is_null[i]) {
+ builder.AppendNull();
+ } else {
+ builder.Append(strings[i]);
+ }
+ }
+ }
+
+ std::shared_ptr<Array> array;
+ ASSERT_OK(builder.Finish(&array));
+
+ std::shared_ptr<Array> slice, slice2;
+
+ slice = array->Slice(5);
+ slice2 = array->Slice(5);
+ ASSERT_EQ(N * reps - 5, slice->length());
+ // RangeEquals takes an END index, so use the parent's length to cover the whole tail [5, length)
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(5, array->length(), 0, slice));
+
+ // Chained slices
+ slice2 = array->Slice(2)->Slice(3);
+ ASSERT_TRUE(slice->Equals(slice2));
+
+ slice = array->Slice(5, 20);
+ slice2 = array->Slice(5, 20);
+ ASSERT_EQ(20, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(5, 25, 0, slice));
+}
+
+TEST_F(TestBinaryArray, TestSliceEquality) {
+ CheckSliceEquality<BinaryType>();
+}
+
+TEST_F(TestStringArray, TestSliceEquality) {
+ CheckSliceEquality<StringType>();  // string fixture must exercise StringType, not repeat the binary case
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-struct-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-struct-test.cc b/cpp/src/arrow/array-struct-test.cc
index 5827c39..f4e7409 100644
--- a/cpp/src/arrow/array-struct-test.cc
+++ b/cpp/src/arrow/array-struct-test.cc
@@ -75,7 +75,7 @@ void ValidateBasicStructArray(const StructArray* result,
ASSERT_EQ(4, list_char_arr->length());
ASSERT_EQ(10, list_char_arr->values()->length());
for (size_t i = 0; i < list_offsets.size(); ++i) {
- ASSERT_EQ(list_offsets[i], list_char_arr->raw_offsets()[i]);
+ ASSERT_EQ(list_offsets[i], list_char_arr->raw_value_offsets()[i]);
}
for (size_t i = 0; i < list_values.size(); ++i) {
ASSERT_EQ(list_values[i], char_arr->Value(i));
@@ -381,6 +381,23 @@ TEST_F(TestStructBuilder, TestEquality) {
EXPECT_FALSE(array->RangeEquals(0, 1, 0, unequal_values_array));
EXPECT_TRUE(array->RangeEquals(1, 3, 1, unequal_values_array));
EXPECT_FALSE(array->RangeEquals(3, 4, 3, unequal_values_array));
+
+ // ARROW-33 Slice / equality
+ std::shared_ptr<Array> slice, slice2;
+
+ slice = array->Slice(2);
+ slice2 = array->Slice(2);
+ ASSERT_EQ(array->length() - 2, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice));
+
+ slice = array->Slice(1, 2);
+ slice2 = array->Slice(1, 2);
+ ASSERT_EQ(2, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(1, 3, 0, slice));
}
TEST_F(TestStructBuilder, TestZeroLength) {
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index a1d8fdf..45130d8 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -43,7 +43,7 @@ TEST_F(TestArray, TestNullCount) {
auto data = std::make_shared<PoolBuffer>(pool_);
auto null_bitmap = std::make_shared<PoolBuffer>(pool_);
- std::unique_ptr<Int32Array> arr(new Int32Array(100, data, 10, null_bitmap));
+ std::unique_ptr<Int32Array> arr(new Int32Array(100, data, null_bitmap, 10));
ASSERT_EQ(10, arr->null_count());
std::unique_ptr<Int32Array> arr_no_nulls(new Int32Array(100, data));
@@ -67,7 +67,7 @@ std::shared_ptr<Array> MakeArrayFromValidBytes(
}
std::shared_ptr<Array> arr(
- new Int32Array(v.size(), value_builder.Finish(), null_count, null_buf));
+ new Int32Array(v.size(), value_builder.Finish(), null_buf, null_count));
return arr;
}
@@ -87,6 +87,32 @@ TEST_F(TestArray, TestEquality) {
EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
}
+TEST_F(TestArray, SliceRecomputeNullCount) {
+ std::vector<uint8_t> valid_bytes = {1, 0, 1, 1, 0, 1, 0, 0};
+ // A 0 byte marks a null slot: positions 1, 4, 6 and 7 are null
+ auto array = MakeArrayFromValidBytes(valid_bytes, pool_);
+
+ ASSERT_EQ(4, array->null_count());
+
+ auto slice = array->Slice(1, 4);
+ ASSERT_EQ(2, slice->null_count());
+ // Elements 4..7 are {0, 1, 0, 0}: one valid slot and three nulls
+ slice = array->Slice(4);
+ ASSERT_EQ(3, slice->null_count());
+
+ slice = array->Slice(0);
+ ASSERT_EQ(4, slice->null_count());
+
+ // No bitmap, compute 0
+ std::shared_ptr<MutableBuffer> data;
+ const int kBufferSize = 64;
+ ASSERT_OK(AllocateBuffer(pool_, kBufferSize, &data));
+ memset(data->mutable_data(), 0, kBufferSize);
+
+ auto arr = std::make_shared<Int32Array>(16, data, nullptr, -1);
+ ASSERT_EQ(0, arr->null_count());
+}
+
TEST_F(TestArray, TestIsNull) {
// clang-format off
std::vector<uint8_t> null_bitmap = {1, 0, 1, 1, 0, 1, 0, 0,
@@ -102,7 +128,7 @@ TEST_F(TestArray, TestIsNull) {
std::shared_ptr<Buffer> null_buf = test::bytes_to_null_buffer(null_bitmap);
std::unique_ptr<Array> arr;
- arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_count, null_buf));
+ arr.reset(new Int32Array(null_bitmap.size(), nullptr, null_buf, null_count));
ASSERT_EQ(null_count, arr->null_count());
ASSERT_EQ(5, null_buf->size());
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array-union-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-union-test.cc b/cpp/src/arrow/array-union-test.cc
new file mode 100644
index 0000000..eb9bd7d
--- /dev/null
+++ b/cpp/src/arrow/array-union-test.cc
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Tests for UnionArray
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "gtest/gtest.h"
+
+#include "arrow/array.h"
+#include "arrow/builder.h"
+#include "arrow/ipc/test-common.h"
+#include "arrow/status.h"
+#include "arrow/table.h"
+#include "arrow/test-util.h"
+#include "arrow/type.h"
+
+namespace arrow {
+
+TEST(TestUnionArrayAdHoc, TestSliceEquals) {
+ std::shared_ptr<RecordBatch> batch;
+ ASSERT_OK(ipc::MakeUnion(&batch));
+
+ const int size = batch->num_rows();
+
+ auto CheckUnion = [&size](std::shared_ptr<Array> array) {
+ std::shared_ptr<Array> slice, slice2;
+ slice = array->Slice(2);
+ slice2 = array->Slice(2);
+ ASSERT_EQ(size - 2, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(2, array->length(), 0, slice));
+
+ // Chained slices
+ slice2 = array->Slice(1)->Slice(1);
+ ASSERT_TRUE(slice->Equals(slice2));
+
+ slice = array->Slice(1, 5);
+ slice2 = array->Slice(1, 5);
+ ASSERT_EQ(5, slice->length());
+
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(1, 6, 0, slice));
+ };
+
+ CheckUnion(batch->column(1));
+ CheckUnion(batch->column(2));
+}
+
+} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 6fc7fb6..f84023e 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -17,6 +17,7 @@
#include "arrow/array.h"
+#include <algorithm>
#include <cstdint>
#include <cstring>
#include <sstream>
@@ -30,28 +31,37 @@
namespace arrow {
-Status GetEmptyBitmap(
- MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result) {
- auto buffer = std::make_shared<PoolBuffer>(pool);
- RETURN_NOT_OK(buffer->Resize(BitUtil::BytesForBits(length)));
- memset(buffer->mutable_data(), 0, buffer->size());
-
- *result = buffer;
- return Status::OK();
-}
+// When slicing, we do not know the null count of the sliced range without
+// doing some computation. To avoid doing this eagerly, we set the null count
+// to -1 (any negative number will do). When Array::null_count is called the
+// first time, the null count will be computed. See ARROW-33
+constexpr int32_t kUnknownNullCount = -1;
// ----------------------------------------------------------------------
// Base array class
-Array::Array(const std::shared_ptr<DataType>& type, int32_t length, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap) {
- type_ = type;
- length_ = length;
- null_count_ = null_count;
- null_bitmap_ = null_bitmap;
+Array::Array(const std::shared_ptr<DataType>& type, int32_t length,
+ const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset)
+ : type_(type),
+ length_(length),
+ offset_(offset),
+ null_count_(null_count),
+ null_bitmap_(null_bitmap),
+ null_bitmap_data_(nullptr) {
if (null_bitmap_) { null_bitmap_data_ = null_bitmap_->data(); }
}
+int32_t Array::null_count() const {
+ if (null_count_ < 0) {  // negative means "unknown": compute once, on first request
+ if (null_bitmap_) {
+ null_count_ = length_ - CountSetBits(null_bitmap_data_, offset_, length_);  // set bits mark VALID slots
+ } else {
+ null_count_ = 0;  // no bitmap: no slot is null
+ }
+ }
+ return null_count_;
+}
+
bool Array::Equals(const Array& arr) const {
bool are_equal = false;
Status error = ArrayEquals(*this, arr, &are_equal);
@@ -86,10 +96,32 @@ bool Array::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_
return are_equal;
}
+// Clamps a requested slice to the parent array. Last two parameters are in-out parameters
+static inline void ConformSliceParams(
+ int32_t array_offset, int32_t array_length, int32_t* offset, int32_t* length) {
+ DCHECK_LE(*offset, array_length);
+ DCHECK_GE(*offset, 0);  // dereference: comparing the pointer itself against 0 checks nothing useful
+ *length = std::min(array_length - *offset, *length);  // never extend past the parent's last element
+ *offset = array_offset + *offset;  // rebase the relative offset onto the parent's own offset
+}
+
+std::shared_ptr<Array> Array::Slice(int32_t offset) const {
+ int32_t slice_length = length_ - offset;  // everything from offset to the end of this array
+ return Slice(offset, slice_length);  // delegate to the two-argument Slice overload
+}
+
Status Array::Validate() const {
return Status::OK();
}
+NullArray::NullArray(int32_t length) : Array(null(), length, nullptr, length) {}
+
+std::shared_ptr<Array> NullArray::Slice(int32_t offset, int32_t length) const {
+ DCHECK_LE(offset, length_);
+ length = std::min(length_ - offset, length);  // clamp to the elements remaining after offset
+ return std::make_shared<NullArray>(length);  // built from the length alone; offset is not forwarded
+}
+
Status NullArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
@@ -98,9 +130,9 @@ Status NullArray::Accept(ArrayVisitor* visitor) const {
// Primitive array base
+// Forwards type/length/bitmap/null count/offset to the Array base, then keeps
+// the data buffer and caches its raw pointer (null when no buffer is given).
PrimitiveArray::PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& data, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap)
- : Array(type, length, null_count, null_bitmap) {
+ const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap,
+ int32_t null_count, int32_t offset)
+ : Array(type, length, null_bitmap, null_count, offset) {
data_ = data;
raw_data_ = data == nullptr ? nullptr : data_->data();
}
@@ -110,6 +142,13 @@ Status NumericArray<T>::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
+// Zero-copy slice: the result reuses type_, data_ and null_bitmap_ and differs
+// only in length and offset. The null count is passed as kUnknownNullCount.
+template <typename T>
+std::shared_ptr<Array> NumericArray<T>::Slice(int32_t offset, int32_t length) const {
+  // Rewrites offset/length in place (see ConformSliceParams).
+  ConformSliceParams(offset_, length_, &offset, &length);
+  std::shared_ptr<Array> sliced = std::make_shared<NumericArray<T>>(
+      type_, length, data_, null_bitmap_, kUnknownNullCount, offset);
+  return sliced;
+}
+
template class NumericArray<UInt8Type>;
template class NumericArray<UInt16Type>;
template class NumericArray<UInt32Type>;
@@ -129,32 +168,33 @@ template class NumericArray<DoubleType>;
// BooleanArray
BooleanArray::BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data,
- int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
- : PrimitiveArray(
- std::make_shared<BooleanType>(), length, data, null_count, null_bitmap) {}
-
-BooleanArray::BooleanArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& data, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap)
- : PrimitiveArray(type, length, data, null_count, null_bitmap) {}
+ const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset)
+ : PrimitiveArray(std::make_shared<BooleanType>(), length, data, null_bitmap,
+ null_count, offset) {}
Status BooleanArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
+// Zero-copy slice of a boolean array: data_ and null_bitmap_ are shared with
+// the result, which is addressed through the conformed offset.
+std::shared_ptr<Array> BooleanArray::Slice(int32_t offset, int32_t length) const {
+  // Rewrites offset/length in place (see ConformSliceParams).
+  ConformSliceParams(offset_, length_, &offset, &length);
+  std::shared_ptr<Array> sliced = std::make_shared<BooleanArray>(
+      length, data_, null_bitmap_, kUnknownNullCount, offset);
+  return sliced;
+}
+
// ----------------------------------------------------------------------
// ListArray
Status ListArray::Validate() const {
if (length_ < 0) { return Status::Invalid("Length was negative"); }
- if (!offsets_buffer_) { return Status::Invalid("offsets_buffer_ was null"); }
- if (offsets_buffer_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
+ if (!value_offsets_) { return Status::Invalid("value_offsets_ was null"); }
+ if (value_offsets_->size() / static_cast<int>(sizeof(int32_t)) < length_) {
std::stringstream ss;
- ss << "offset buffer size (bytes): " << offsets_buffer_->size()
+ ss << "offset buffer size (bytes): " << value_offsets_->size()
<< " isn't large enough for length: " << length_;
return Status::Invalid(ss.str());
}
- const int32_t last_offset = offset(length_);
+ const int32_t last_offset = this->value_offset(length_);
if (last_offset > 0) {
if (!values_) {
return Status::Invalid("last offset was non-zero and values was null");
@@ -174,14 +214,15 @@ Status ListArray::Validate() const {
}
}
- int32_t prev_offset = offset(0);
+ int32_t prev_offset = this->value_offset(0);
if (prev_offset != 0) { return Status::Invalid("The first offset wasn't zero"); }
for (int32_t i = 1; i <= length_; ++i) {
- int32_t current_offset = offset(i);
+ int32_t current_offset = this->value_offset(i);
if (IsNull(i - 1) && current_offset != prev_offset) {
std::stringstream ss;
- ss << "Offset invariant failure at: " << i << " inconsistent offsets for null slot"
- << current_offset << "!=" << prev_offset;
+ ss << "Offset invariant failure at: " << i
+ << " inconsistent value_offsets for null slot" << current_offset
+ << "!=" << prev_offset;
return Status::Invalid(ss.str());
}
if (current_offset < prev_offset) {
@@ -200,26 +241,33 @@ Status ListArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
+// Zero-copy slice of a list array: value_offsets_, values_ and null_bitmap_ are
+// passed through unchanged; only length and offset differ in the result.
+std::shared_ptr<Array> ListArray::Slice(int32_t offset, int32_t length) const {
+  // Rewrites offset/length in place (see ConformSliceParams).
+  ConformSliceParams(offset_, length_, &offset, &length);
+  std::shared_ptr<Array> sliced = std::make_shared<ListArray>(
+      type_, length, value_offsets_, values_, null_bitmap_, kUnknownNullCount, offset);
+  return sliced;
+}
+
// ----------------------------------------------------------------------
// String and binary
static std::shared_ptr<DataType> kBinary = std::make_shared<BinaryType>();
static std::shared_ptr<DataType> kString = std::make_shared<StringType>();
-BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
- const std::shared_ptr<Buffer>& data, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap)
- : BinaryArray(kBinary, length, offsets, data, null_count, null_bitmap) {}
+BinaryArray::BinaryArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap,
+ int32_t null_count, int32_t offset)
+ : BinaryArray(kBinary, length, value_offsets, data, null_bitmap, null_count, offset) {
+}
+// Typed constructor used to pass a specific logical type up to Array.
+// NOTE: value_offsets is dereferenced unconditionally in the initializer list,
+// so it must be non-null; the data buffer may be null, in which case raw_data_
+// stays nullptr.
BinaryArray::BinaryArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data,
- int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
- : Array(type, length, null_count, null_bitmap),
- offsets_buffer_(offsets),
- offsets_(reinterpret_cast<const int32_t*>(offsets_buffer_->data())),
- data_buffer_(data),
- data_(nullptr) {
- if (data_buffer_ != nullptr) { data_ = data_buffer_->data(); }
+ const std::shared_ptr<Buffer>& value_offsets, const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset)
+ : Array(type, length, null_bitmap, null_count, offset),
+ value_offsets_(value_offsets),
+ raw_value_offsets_(reinterpret_cast<const int32_t*>(value_offsets_->data())),
+ data_(data),
+ raw_data_(nullptr) {
+ if (data_ != nullptr) { raw_data_ = data_->data(); }
}
Status BinaryArray::Validate() const {
@@ -231,10 +279,17 @@ Status BinaryArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
-StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
- const std::shared_ptr<Buffer>& data, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap)
- : BinaryArray(kString, length, offsets, data, null_count, null_bitmap) {}
+// Zero-copy slice of a binary array: the value offsets, data and null bitmap
+// buffers are shared with the result; only length and offset change.
+std::shared_ptr<Array> BinaryArray::Slice(int32_t offset, int32_t length) const {
+  // Rewrites offset/length in place (see ConformSliceParams).
+  ConformSliceParams(offset_, length_, &offset, &length);
+  std::shared_ptr<Array> sliced = std::make_shared<BinaryArray>(
+      length, value_offsets_, data_, null_bitmap_, kUnknownNullCount, offset);
+  return sliced;
+}
+
+// Delegates to the typed BinaryArray constructor, supplying the file-level
+// kString type instance as the logical type.
+StringArray::StringArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets,
+    const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap,
+    int32_t null_count, int32_t offset)
+    : BinaryArray(
+          kString, length, value_offsets, data, null_bitmap, null_count, offset) {}
Status StringArray::Validate() const {
// TODO(emkornfield) Validate proper UTF8 code points?
@@ -245,12 +300,26 @@ Status StringArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
+// Zero-copy slice of a string array; same buffer sharing as the binary case,
+// but the result is constructed as a StringArray.
+std::shared_ptr<Array> StringArray::Slice(int32_t offset, int32_t length) const {
+  // Rewrites offset/length in place (see ConformSliceParams).
+  ConformSliceParams(offset_, length_, &offset, &length);
+  std::shared_ptr<Array> sliced = std::make_shared<StringArray>(
+      length, value_offsets_, data_, null_bitmap_, kUnknownNullCount, offset);
+  return sliced;
+}
+
// ----------------------------------------------------------------------
// Struct
+// Builds a struct array over the given child arrays. The type, length, bitmap,
+// null count and offset are all handled by the Array base constructor, so the
+// only member set here is the list of children (copied once, directly in the
+// initializer list).
+StructArray::StructArray(const std::shared_ptr<DataType>& type, int32_t length,
+    const std::vector<std::shared_ptr<Array>>& children,
+    std::shared_ptr<Buffer> null_bitmap, int32_t null_count, int32_t offset)
+    : Array(type, length, null_bitmap, null_count, offset), children_(children) {}
+
+// Returns the child array at position `pos`. Only the non-emptiness of the
+// child list is DCHECKed; `pos` itself is not range-checked here.
std::shared_ptr<Array> StructArray::field(int32_t pos) const {
- DCHECK_GT(field_arrays_.size(), 0);
- return field_arrays_[pos];
+ DCHECK_GT(children_.size(), 0);
+ return children_[pos];
}
Status StructArray::Validate() const {
@@ -260,11 +329,11 @@ Status StructArray::Validate() const {
return Status::Invalid("Null count exceeds the length of this struct");
}
- if (field_arrays_.size() > 0) {
+ if (children_.size() > 0) {
// Validate fields
- int32_t array_length = field_arrays_[0]->length();
+ int32_t array_length = children_[0]->length();
size_t idx = 0;
- for (auto it : field_arrays_) {
+ for (auto it : children_) {
if (it->length() != array_length) {
std::stringstream ss;
ss << "Length is not equal from field " << it->type()->ToString()
@@ -293,19 +362,27 @@ Status StructArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
+// Zero-copy slice of a struct array: the child arrays are passed through as-is
+// (they are not sliced here); the result differs only in length and offset.
+std::shared_ptr<Array> StructArray::Slice(int32_t offset, int32_t length) const {
+  // Rewrites offset/length in place (see ConformSliceParams).
+  ConformSliceParams(offset_, length_, &offset, &length);
+  std::shared_ptr<Array> sliced = std::make_shared<StructArray>(
+      type_, length, children_, null_bitmap_, kUnknownNullCount, offset);
+  return sliced;
+}
+
// ----------------------------------------------------------------------
// UnionArray
+// Builds a union array over `children`. type_ids must be non-null (it is
+// dereferenced unconditionally); value_offsets is optional, and the cached raw
+// offsets pointer is explicitly null when it is absent.
UnionArray::UnionArray(const std::shared_ptr<DataType>& type, int32_t length,
const std::vector<std::shared_ptr<Array>>& children,
- const std::shared_ptr<Buffer>& type_ids, const std::shared_ptr<Buffer>& offsets,
- int32_t null_count, const std::shared_ptr<Buffer>& null_bitmap)
- : Array(type, length, null_count, null_bitmap),
+ const std::shared_ptr<Buffer>& type_ids, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset)
+ : Array(type, length, null_bitmap, null_count, offset),
children_(children),
- type_ids_buffer_(type_ids),
- offsets_buffer_(offsets) {
- type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data());
- if (offsets) { offsets_ = reinterpret_cast<const int32_t*>(offsets->data()); }
+ type_ids_(type_ids),
+ value_offsets_(value_offsets),
+ raw_value_offsets_(nullptr) {
+ raw_type_ids_ = reinterpret_cast<const uint8_t*>(type_ids->data());
+ // Without an offsets buffer raw_value_offsets_ keeps its nullptr initial value
+ // instead of being left indeterminate.
+ if (value_offsets) {
+ raw_value_offsets_ = reinterpret_cast<const int32_t*>(value_offsets->data());
+ }
}
std::shared_ptr<Array> UnionArray::child(int32_t pos) const {
@@ -328,18 +405,24 @@ Status UnionArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
+// Zero-copy slice of a union array: children, type ids, value offsets and the
+// null bitmap are all shared with the result; only length and offset change.
+std::shared_ptr<Array> UnionArray::Slice(int32_t offset, int32_t length) const {
+  // Rewrites offset/length in place (see ConformSliceParams).
+  ConformSliceParams(offset_, length_, &offset, &length);
+  std::shared_ptr<Array> sliced = std::make_shared<UnionArray>(type_, length, children_,
+      type_ids_, value_offsets_, null_bitmap_, kUnknownNullCount, offset);
+  return sliced;
+}
+
// ----------------------------------------------------------------------
// DictionaryArray
Status DictionaryArray::FromBuffer(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& indices, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<DictionaryArray>* out) {
+ const std::shared_ptr<Buffer>& indices, const std::shared_ptr<Buffer>& null_bitmap,
+ int32_t null_count, int32_t offset, std::shared_ptr<DictionaryArray>* out) {
DCHECK_EQ(type->type, Type::DICTIONARY);
const auto& dict_type = static_cast<const DictionaryType*>(type.get());
std::shared_ptr<Array> boxed_indices;
- RETURN_NOT_OK(MakePrimitiveArray(
- dict_type->index_type(), length, indices, null_count, null_bitmap, &boxed_indices));
+ RETURN_NOT_OK(MakePrimitiveArray(dict_type->index_type(), length, indices, null_bitmap,
+ null_count, offset, &boxed_indices));
*out = std::make_shared<DictionaryArray>(type, boxed_indices);
return Status::OK();
@@ -347,7 +430,8 @@ Status DictionaryArray::FromBuffer(const std::shared_ptr<DataType>& type, int32_
DictionaryArray::DictionaryArray(
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& indices)
- : Array(type, indices->length(), indices->null_count(), indices->null_bitmap()),
+ : Array(type, indices->length(), indices->null_bitmap(), indices->null_count(),
+ indices->offset()),
dict_type_(static_cast<const DictionaryType*>(type.get())),
indices_(indices) {
DCHECK_EQ(type->type, Type::DICTIONARY);
@@ -369,16 +453,21 @@ Status DictionaryArray::Accept(ArrayVisitor* visitor) const {
return visitor->Visit(*this);
}
+// Slices the indices array and wraps the result in a new DictionaryArray of the
+// same type. Offset and length handling is left entirely to indices_->Slice.
+std::shared_ptr<Array> DictionaryArray::Slice(int32_t offset, int32_t length) const {
+  return std::make_shared<DictionaryArray>(type_, indices_->Slice(offset, length));
+}
+
// ----------------------------------------------------------------------
-#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \
- case Type::ENUM: \
- out->reset(new ArrayType(type, length, data, null_count, null_bitmap)); \
+#define MAKE_PRIMITIVE_ARRAY_CASE(ENUM, ArrayType) \
+ case Type::ENUM: \
+ out->reset(new ArrayType(type, length, data, null_bitmap, null_count, offset)); \
break;
Status MakePrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& data, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out) {
+ const std::shared_ptr<Buffer>& data, const std::shared_ptr<Buffer>& null_bitmap,
+ int32_t null_count, int32_t offset, std::shared_ptr<Array>* out) {
switch (type->type) {
MAKE_PRIMITIVE_ARRAY_CASE(BOOL, BooleanArray);
MAKE_PRIMITIVE_ARRAY_CASE(UINT8, UInt8Array);
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 3b6e93f..f3e8f9a 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -27,6 +27,7 @@
#include "arrow/buffer.h"
#include "arrow/type.h"
#include "arrow/type_fwd.h"
+#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/macros.h"
#include "arrow/util/visibility.h"
@@ -71,23 +72,36 @@ class ArrayVisitor {
///
/// The base class is only required to have a null bitmap buffer if the null
/// count is greater than 0
+///
+/// If known, the null count can be provided in the base Array constructor. If
+/// the null count is not known, pass -1 to indicate that the null count is to
+/// be computed on the first call to null_count()
class ARROW_EXPORT Array {
public:
- Array(const std::shared_ptr<DataType>& type, int32_t length, int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+ Array(const std::shared_ptr<DataType>& type, int32_t length,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
virtual ~Array() = default;
/// Determine if a slot is null. For inner loops. Does *not* boundscheck
bool IsNull(int i) const {
- return null_count_ > 0 && BitUtil::BitNotSet(null_bitmap_data_, i);
+ return null_bitmap_data_ != nullptr &&
+ BitUtil::BitNotSet(null_bitmap_data_, i + offset_);
}
/// Size in the number of elements this array contains.
int32_t length() const { return length_; }
- /// The number of null entries in the array.
- int32_t null_count() const { return null_count_; }
+ /// A relative position into another array's data, to enable zero-copy
+ /// slicing. This value defaults to zero
+ int32_t offset() const { return offset_; }
+
+ /// The number of null entries in the array. If the null count was not known
+ /// at time of construction (and set to a negative value), then the null
+ /// count will be computed and cached on the first invocation of this
+ /// function
+ int32_t null_count() const;
std::shared_ptr<DataType> type() const { return type_; }
Type::type type_enum() const { return type_->type; }
@@ -95,11 +109,13 @@ class ARROW_EXPORT Array {
/// Buffer for the null bitmap.
///
/// Note that for `null_count == 0`, this can be a `nullptr`.
+ /// This buffer does not account for any slice offset
std::shared_ptr<Buffer> null_bitmap() const { return null_bitmap_; }
/// Raw pointer to the null bitmap.
///
/// Note that for `null_count == 0`, this can be a `nullptr`.
+ /// This buffer does not account for any slice offset
const uint8_t* null_bitmap_data() const { return null_bitmap_data_; }
bool Equals(const Array& arr) const;
@@ -120,10 +136,29 @@ class ARROW_EXPORT Array {
virtual Status Accept(ArrayVisitor* visitor) const = 0;
+ /// Construct a zero-copy slice of the array with the indicated offset and
+ /// length
+ ///
+ /// \param[in] offset the position of the first element in the constructed slice
+ /// \param[in] length the length of the slice. If there are not enough elements
+ /// in the array, the length will be adjusted accordingly
+ ///
+ /// \return a new object wrapped in std::shared_ptr<Array>
+ virtual std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const = 0;
+
+ /// Slice from offset until end of the array
+ std::shared_ptr<Array> Slice(int32_t offset) const;
+
protected:
std::shared_ptr<DataType> type_;
- int32_t null_count_;
int32_t length_;
+ int32_t offset_;
+
+ // This member is marked mutable so that it can be modified when null_count()
+ // is called from a const context and the null count has to be computed (if
+ // it is not already known)
+ mutable int32_t null_count_;
std::shared_ptr<Buffer> null_bitmap_;
const uint8_t* null_bitmap_data_;
@@ -138,28 +173,26 @@ class ARROW_EXPORT NullArray : public Array {
public:
using TypeClass = NullType;
- NullArray(const std::shared_ptr<DataType>& type, int32_t length)
- : Array(type, length, length, nullptr) {}
-
- explicit NullArray(int32_t length) : NullArray(std::make_shared<NullType>(), length) {}
+ explicit NullArray(int32_t length);
Status Accept(ArrayVisitor* visitor) const override;
-};
-Status ARROW_EXPORT GetEmptyBitmap(
- MemoryPool* pool, int32_t length, std::shared_ptr<MutableBuffer>* result);
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
+};
/// Base class for fixed-size logical types
class ARROW_EXPORT PrimitiveArray : public Array {
public:
- virtual ~PrimitiveArray() {}
+ PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
+ /// The memory containing this array's data
+ /// This buffer does not account for any slice offset
std::shared_ptr<Buffer> data() const { return data_; }
protected:
- PrimitiveArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr);
std::shared_ptr<Buffer> data_;
const uint8_t* raw_data_;
};
@@ -169,21 +202,28 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray {
public:
using TypeClass = TYPE;
using value_type = typename TypeClass::c_type;
- NumericArray(int32_t length, const std::shared_ptr<Buffer>& data,
- int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
- : PrimitiveArray(
- std::make_shared<TypeClass>(), length, data, null_count, null_bitmap) {}
- NumericArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr)
- : PrimitiveArray(type, length, data, null_count, null_bitmap) {}
+
+ using PrimitiveArray::PrimitiveArray;
+
+ // Only enable this constructor without a type argument for types without additional
+ // metadata
+ template <typename T1 = TYPE>
+ NumericArray(
+ typename std::enable_if<TypeTraits<T1>::is_parameter_free, int32_t>::type length,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0)
+ : PrimitiveArray(TypeTraits<T1>::type_singleton(), length, data, null_bitmap,
+ null_count, offset) {}
const value_type* raw_data() const {
- return reinterpret_cast<const value_type*>(raw_data_);
+ return reinterpret_cast<const value_type*>(raw_data_) + offset_;
}
Status Accept(ArrayVisitor* visitor) const override;
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
+
value_type Value(int i) const { return raw_data()[i]; }
};
@@ -191,17 +231,19 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
public:
using TypeClass = BooleanType;
+ using PrimitiveArray::PrimitiveArray;
+
BooleanArray(int32_t length, const std::shared_ptr<Buffer>& data,
- int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
- BooleanArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
Status Accept(ArrayVisitor* visitor) const override;
- const uint8_t* raw_data() const { return reinterpret_cast<const uint8_t*>(raw_data_); }
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
- bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); }
+ bool Value(int i) const {
+ return BitUtil::GetBit(reinterpret_cast<const uint8_t*>(raw_data_), i + offset_);
+ }
};
// ----------------------------------------------------------------------
@@ -212,39 +254,45 @@ class ARROW_EXPORT ListArray : public Array {
using TypeClass = ListType;
ListArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Array>& values,
- int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr)
- : Array(type, length, null_count, null_bitmap) {
- offsets_buffer_ = offsets;
- offsets_ = offsets == nullptr ? nullptr : reinterpret_cast<const int32_t*>(
- offsets_buffer_->data());
+ const std::shared_ptr<Buffer>& value_offsets, const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0)
+ : Array(type, length, null_bitmap, null_count, offset) {
+ value_offsets_ = value_offsets;
+ raw_value_offsets_ = value_offsets == nullptr
+ ? nullptr
+ : reinterpret_cast<const int32_t*>(value_offsets_->data());
values_ = values;
}
Status Validate() const override;
- virtual ~ListArray() = default;
-
// Return a shared pointer in case the requestor desires to share ownership
// with this array.
std::shared_ptr<Array> values() const { return values_; }
- std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; }
- std::shared_ptr<DataType> value_type() const { return values_->type(); }
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_offsets() const { return value_offsets_; }
- const int32_t* raw_offsets() const { return offsets_; }
+ std::shared_ptr<DataType> value_type() const { return values_->type(); }
- int32_t offset(int i) const { return offsets_[i]; }
+ /// Return pointer to raw value offsets accounting for any slice offset
+ const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; }
// Neither of these functions will perform boundschecking
- int32_t value_offset(int i) const { return offsets_[i]; }
- int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; }
+ int32_t value_offset(int i) const { return raw_value_offsets_[i + offset_]; }
+ int32_t value_length(int i) const {
+ i += offset_;
+ return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
+ }
Status Accept(ArrayVisitor* visitor) const override;
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
+
protected:
- std::shared_ptr<Buffer> offsets_buffer_;
- const int32_t* offsets_;
+ std::shared_ptr<Buffer> value_offsets_;
+ const int32_t* raw_value_offsets_;
std::shared_ptr<Array> values_;
};
@@ -255,55 +303,67 @@ class ARROW_EXPORT BinaryArray : public Array {
public:
using TypeClass = BinaryType;
- BinaryArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
- const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr);
-
- // Constructor that allows sub-classes/builders to propagate there logical type up the
- // class hierarchy.
- BinaryArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& offsets, const std::shared_ptr<Buffer>& data,
- int32_t null_count = 0, const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+ BinaryArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
// Return the pointer to the given elements bytes
// TODO(emkornfield) introduce a StringPiece or something similar to capture zero-copy
// pointer + offset
const uint8_t* GetValue(int i, int32_t* out_length) const {
- const int32_t pos = offsets_[i];
- *out_length = offsets_[i + 1] - pos;
- return data_ + pos;
+ // Account for base offset
+ i += offset_;
+
+ const int32_t pos = raw_value_offsets_[i];
+ *out_length = raw_value_offsets_[i + 1] - pos;
+ return raw_data_ + pos;
}
- std::shared_ptr<Buffer> data() const { return data_buffer_; }
- std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; }
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> data() const { return data_; }
- const int32_t* raw_offsets() const { return offsets_; }
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_offsets() const { return value_offsets_; }
- int32_t offset(int i) const { return offsets_[i]; }
+ const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; }
// Neither of these functions will perform boundschecking
- int32_t value_offset(int i) const { return offsets_[i]; }
- int32_t value_length(int i) const { return offsets_[i + 1] - offsets_[i]; }
+ int32_t value_offset(int i) const { return raw_value_offsets_[i + offset_]; }
+ int32_t value_length(int i) const {
+ i += offset_;
+ return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
+ }
Status Validate() const override;
Status Accept(ArrayVisitor* visitor) const override;
- private:
- std::shared_ptr<Buffer> offsets_buffer_;
- const int32_t* offsets_;
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
+
+ protected:
+ // Constructor that allows sub-classes/builders to propagate their logical type up the
+ // class hierarchy.
+ BinaryArray(const std::shared_ptr<DataType>& type, int32_t length,
+ const std::shared_ptr<Buffer>& value_offsets, const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
- std::shared_ptr<Buffer> data_buffer_;
- const uint8_t* data_;
+ std::shared_ptr<Buffer> value_offsets_;
+ const int32_t* raw_value_offsets_;
+
+ std::shared_ptr<Buffer> data_;
+ const uint8_t* raw_data_;
};
class ARROW_EXPORT StringArray : public BinaryArray {
public:
using TypeClass = StringType;
- StringArray(int32_t length, const std::shared_ptr<Buffer>& offsets,
- const std::shared_ptr<Buffer>& data, int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+ StringArray(int32_t length, const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
// Construct a std::string
// TODO: std::bad_alloc possibility
@@ -316,6 +376,8 @@ class ARROW_EXPORT StringArray : public BinaryArray {
Status Validate() const override;
Status Accept(ArrayVisitor* visitor) const override;
+
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
};
// ----------------------------------------------------------------------
@@ -326,28 +388,25 @@ class ARROW_EXPORT StructArray : public Array {
using TypeClass = StructType;
StructArray(const std::shared_ptr<DataType>& type, int32_t length,
- const std::vector<std::shared_ptr<Array>>& field_arrays, int32_t null_count = 0,
- std::shared_ptr<Buffer> null_bitmap = nullptr)
- : Array(type, length, null_count, null_bitmap) {
- type_ = type;
- field_arrays_ = field_arrays;
- }
+ const std::vector<std::shared_ptr<Array>>& children,
+ std::shared_ptr<Buffer> null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
Status Validate() const override;
- virtual ~StructArray() {}
-
// Return a shared pointer in case the requestor desires to share ownership
// with this array.
std::shared_ptr<Array> field(int32_t pos) const;
- const std::vector<std::shared_ptr<Array>>& fields() const { return field_arrays_; }
+ const std::vector<std::shared_ptr<Array>>& fields() const { return children_; }
Status Accept(ArrayVisitor* visitor) const override;
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
+
protected:
// The child arrays corresponding to each field of the struct data type.
- std::vector<std::shared_ptr<Array>> field_arrays_;
+ std::vector<std::shared_ptr<Array>> children_;
};
// ----------------------------------------------------------------------
@@ -356,22 +415,25 @@ class ARROW_EXPORT StructArray : public Array {
class ARROW_EXPORT UnionArray : public Array {
public:
using TypeClass = UnionType;
+ using type_id_t = uint8_t;
UnionArray(const std::shared_ptr<DataType>& type, int32_t length,
const std::vector<std::shared_ptr<Array>>& children,
const std::shared_ptr<Buffer>& type_ids,
- const std::shared_ptr<Buffer>& offsets = nullptr, int32_t null_count = 0,
- const std::shared_ptr<Buffer>& null_bitmap = nullptr);
+ const std::shared_ptr<Buffer>& value_offsets = nullptr,
+ const std::shared_ptr<Buffer>& null_bitmap = nullptr, int32_t null_count = 0,
+ int32_t offset = 0);
Status Validate() const override;
- virtual ~UnionArray() {}
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> type_ids() const { return type_ids_; }
- std::shared_ptr<Buffer> type_ids() const { return type_ids_buffer_; }
- const uint8_t* raw_type_ids() const { return type_ids_; }
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_offsets() const { return value_offsets_; }
- std::shared_ptr<Buffer> offsets() const { return offsets_buffer_; }
- const int32_t* raw_offsets() const { return offsets_; }
+ const type_id_t* raw_type_ids() const { return raw_type_ids_ + offset_; }
+ const int32_t* raw_value_offsets() const { return raw_value_offsets_ + offset_; }
UnionMode mode() const { return static_cast<const UnionType&>(*type_.get()).mode; }
@@ -381,14 +443,16 @@ class ARROW_EXPORT UnionArray : public Array {
Status Accept(ArrayVisitor* visitor) const override;
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
+
protected:
std::vector<std::shared_ptr<Array>> children_;
- std::shared_ptr<Buffer> type_ids_buffer_;
- const uint8_t* type_ids_;
+ std::shared_ptr<Buffer> type_ids_;
+ const type_id_t* raw_type_ids_;
- std::shared_ptr<Buffer> offsets_buffer_;
- const int32_t* offsets_;
+ std::shared_ptr<Buffer> value_offsets_;
+ const int32_t* raw_value_offsets_;
};
// ----------------------------------------------------------------------
@@ -419,8 +483,8 @@ class ARROW_EXPORT DictionaryArray : public Array {
// Alternate ctor; other attributes (like null count) are inherited from the
// passed indices array
static Status FromBuffer(const std::shared_ptr<DataType>& type, int32_t length,
- const std::shared_ptr<Buffer>& indices, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<DictionaryArray>* out);
+ const std::shared_ptr<Buffer>& indices, const std::shared_ptr<Buffer>& null_bitmap,
+ int32_t null_count, int32_t offset, std::shared_ptr<DictionaryArray>* out);
Status Validate() const override;
@@ -431,6 +495,8 @@ class ARROW_EXPORT DictionaryArray : public Array {
Status Accept(ArrayVisitor* visitor) const override;
+ std::shared_ptr<Array> Slice(int32_t offset, int32_t length) const override;
+
protected:
const DictionaryType* dict_type_;
std::shared_ptr<Array> indices_;
@@ -471,8 +537,9 @@ extern template class ARROW_EXPORT NumericArray<TimeType>;
// Create new arrays for logical types that are backed by primitive arrays.
Status ARROW_EXPORT MakePrimitiveArray(const std::shared_ptr<DataType>& type,
- int32_t length, const std::shared_ptr<Buffer>& data, int32_t null_count,
- const std::shared_ptr<Buffer>& null_bitmap, std::shared_ptr<Array>* out);
+ int32_t length, const std::shared_ptr<Buffer>& data,
+ const std::shared_ptr<Buffer>& null_bitmap, int32_t null_count, int32_t offset,
+ std::shared_ptr<Array>* out);
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/buffer.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.cc b/cpp/src/arrow/buffer.cc
index 6cce0ef..fb5a010 100644
--- a/cpp/src/arrow/buffer.cc
+++ b/cpp/src/arrow/buffer.cc
@@ -116,4 +116,20 @@ Status PoolBuffer::Resize(int64_t new_size, bool shrink_to_fit) {
return Status::OK();
}
+Status AllocateBuffer(
+ MemoryPool* pool, int64_t size, std::shared_ptr<MutableBuffer>* out) {
+ auto buffer = std::make_shared<PoolBuffer>(pool);
+ RETURN_NOT_OK(buffer->Resize(size));
+ *out = buffer;
+ return Status::OK();
+}
+
+Status AllocateResizableBuffer(
+ MemoryPool* pool, int64_t size, std::shared_ptr<ResizableBuffer>* out) {
+ auto buffer = std::make_shared<PoolBuffer>(pool);
+ RETURN_NOT_OK(buffer->Resize(size));
+ *out = buffer;
+ return Status::OK();
+}
+
} // namespace arrow
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/buffer.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h
index d43ab03..9c400b1 100644
--- a/cpp/src/arrow/buffer.h
+++ b/cpp/src/arrow/buffer.h
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-#ifndef ARROW_UTIL_BUFFER_H
-#define ARROW_UTIL_BUFFER_H
+#ifndef ARROW_BUFFER_H
+#define ARROW_BUFFER_H
#include <algorithm>
#include <cstdint>
@@ -105,7 +105,7 @@ class ARROW_EXPORT Buffer : public std::enable_shared_from_this<Buffer> {
/// Construct a view on passed buffer at the indicated offset and length. This
/// function cannot fail and does no error checking (except in debug builds)
-ARROW_EXPORT std::shared_ptr<Buffer> SliceBuffer(
+std::shared_ptr<Buffer> ARROW_EXPORT SliceBuffer(
const std::shared_ptr<Buffer>& buffer, int64_t offset, int64_t length);
/// A Buffer whose contents can be mutated. May or may not own its data.
@@ -232,6 +232,19 @@ class ARROW_EXPORT BufferBuilder {
int64_t size_;
};
+/// Allocate a new mutable buffer from a memory pool
+///
+/// \param[in] pool a memory pool
+/// \param[in] size size of buffer to allocate
+/// \param[out] out the allocated buffer with padding
+///
+/// \return Status message
+Status ARROW_EXPORT AllocateBuffer(
+ MemoryPool* pool, int64_t size, std::shared_ptr<MutableBuffer>* out);
+
+Status ARROW_EXPORT AllocateResizableBuffer(
+ MemoryPool* pool, int64_t size, std::shared_ptr<ResizableBuffer>* out);
+
} // namespace arrow
-#endif // ARROW_UTIL_BUFFER_H
+#endif // ARROW_BUFFER_H
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index b0dc41b..dddadee 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -185,7 +185,7 @@ Status PrimitiveBuilder<T>::Finish(std::shared_ptr<Array>* out) {
RETURN_NOT_OK(data_->Resize(bytes_required));
}
*out = std::make_shared<typename TypeTraits<T>::ArrayType>(
- type_, length_, data_, null_count_, null_bitmap_);
+ type_, length_, data_, null_bitmap_, null_count_);
data_ = null_bitmap_ = nullptr;
capacity_ = length_ = null_count_ = 0;
@@ -202,10 +202,19 @@ template class PrimitiveBuilder<Int32Type>;
template class PrimitiveBuilder<Int64Type>;
template class PrimitiveBuilder<DateType>;
template class PrimitiveBuilder<TimestampType>;
+template class PrimitiveBuilder<TimeType>;
template class PrimitiveBuilder<HalfFloatType>;
template class PrimitiveBuilder<FloatType>;
template class PrimitiveBuilder<DoubleType>;
+BooleanBuilder::BooleanBuilder(MemoryPool* pool)
+ : ArrayBuilder(pool, boolean()), data_(nullptr), raw_data_(nullptr) {}
+
+BooleanBuilder::BooleanBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type)
+ : BooleanBuilder(pool) {
+ DCHECK_EQ(Type::BOOL, type->type);
+}
+
Status BooleanBuilder::Init(int32_t capacity) {
RETURN_NOT_OK(ArrayBuilder::Init(capacity));
data_ = std::make_shared<PoolBuffer>(pool_);
@@ -244,7 +253,7 @@ Status BooleanBuilder::Finish(std::shared_ptr<Array>* out) {
// Trim buffers
RETURN_NOT_OK(data_->Resize(bytes_required));
}
- *out = std::make_shared<BooleanArray>(type_, length_, data_, null_count_, null_bitmap_);
+ *out = std::make_shared<BooleanArray>(type_, length_, data_, null_bitmap_, null_count_);
data_ = null_bitmap_ = nullptr;
capacity_ = length_ = null_count_ = 0;
@@ -313,7 +322,7 @@ Status ListBuilder::Finish(std::shared_ptr<Array>* out) {
std::shared_ptr<Buffer> offsets = offset_builder_.Finish();
*out = std::make_shared<ListArray>(
- type_, length_, offsets, items, null_count_, null_bitmap_);
+ type_, length_, offsets, items, null_bitmap_, null_count_);
Reset();
@@ -333,14 +342,13 @@ std::shared_ptr<ArrayBuilder> ListBuilder::value_builder() const {
// ----------------------------------------------------------------------
// String and binary
-// This used to be a static member variable of BinaryBuilder, but it can cause
-// valgrind to report a (spurious?) memory leak when needed in other shared
-// libraries. The problem came up while adding explicit visibility to libarrow
-// and libparquet_arrow
-static TypePtr kBinaryValueType = TypePtr(new UInt8Type());
+BinaryBuilder::BinaryBuilder(MemoryPool* pool)
+ : ListBuilder(pool, std::make_shared<UInt8Builder>(pool, uint8()), binary()) {
+ byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
+}
BinaryBuilder::BinaryBuilder(MemoryPool* pool, const TypePtr& type)
- : ListBuilder(pool, std::make_shared<UInt8Builder>(pool, kBinaryValueType), type) {
+ : ListBuilder(pool, std::make_shared<UInt8Builder>(pool, uint8()), type) {
byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get());
}
@@ -351,11 +359,13 @@ Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) {
const auto list = std::dynamic_pointer_cast<ListArray>(result);
auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
- *out = std::make_shared<BinaryArray>(list->length(), list->offsets(), values->data(),
- list->null_count(), list->null_bitmap());
+ *out = std::make_shared<BinaryArray>(list->length(), list->value_offsets(),
+ values->data(), list->null_bitmap(), list->null_count());
return Status::OK();
}
+StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {}
+
Status StringBuilder::Finish(std::shared_ptr<Array>* out) {
std::shared_ptr<Array> result;
RETURN_NOT_OK(ListBuilder::Finish(&result));
@@ -363,8 +373,8 @@ Status StringBuilder::Finish(std::shared_ptr<Array>* out) {
const auto list = std::dynamic_pointer_cast<ListArray>(result);
auto values = std::dynamic_pointer_cast<UInt8Array>(list->values());
- *out = std::make_shared<StringArray>(list->length(), list->offsets(), values->data(),
- list->null_count(), list->null_bitmap());
+ *out = std::make_shared<StringArray>(list->length(), list->value_offsets(),
+ values->data(), list->null_bitmap(), list->null_count());
return Status::OK();
}
@@ -377,7 +387,7 @@ Status StructBuilder::Finish(std::shared_ptr<Array>* out) {
RETURN_NOT_OK(field_builders_[i]->Finish(&fields[i]));
}
- *out = std::make_shared<StructArray>(type_, length_, fields, null_count_, null_bitmap_);
+ *out = std::make_shared<StructArray>(type_, length_, fields, null_bitmap_, null_count_);
null_bitmap_ = nullptr;
capacity_ = length_ = null_count_ = 0;
@@ -393,9 +403,9 @@ std::shared_ptr<ArrayBuilder> StructBuilder::field_builder(int pos) const {
// ----------------------------------------------------------------------
// Helper functions
-#define BUILDER_CASE(ENUM, BuilderType) \
- case Type::ENUM: \
- out->reset(new BuilderType(pool, type)); \
+#define BUILDER_CASE(ENUM, BuilderType) \
+ case Type::ENUM: \
+ out->reset(new BuilderType(pool)); \
return Status::OK();
// Initially looked at doing this with vtables, but shared pointers makes it
@@ -414,19 +424,17 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
BUILDER_CASE(UINT64, UInt64Builder);
BUILDER_CASE(INT64, Int64Builder);
BUILDER_CASE(DATE, DateBuilder);
- BUILDER_CASE(TIMESTAMP, TimestampBuilder);
-
- BUILDER_CASE(BOOL, BooleanBuilder);
-
- BUILDER_CASE(FLOAT, FloatBuilder);
- BUILDER_CASE(DOUBLE, DoubleBuilder);
-
- case Type::STRING:
- out->reset(new StringBuilder(pool));
+ case Type::TIMESTAMP:
+ out->reset(new TimestampBuilder(pool, type));
return Status::OK();
- case Type::BINARY:
- out->reset(new BinaryBuilder(pool, type));
+ case Type::TIME:
+ out->reset(new TimeBuilder(pool, type));
return Status::OK();
+ BUILDER_CASE(BOOL, BooleanBuilder);
+ BUILDER_CASE(FLOAT, FloatBuilder);
+ BUILDER_CASE(DOUBLE, DoubleBuilder);
+ BUILDER_CASE(STRING, StringBuilder);
+ BUILDER_CASE(BINARY, BinaryBuilder);
case Type::LIST: {
std::shared_ptr<ArrayBuilder> value_builder;
std::shared_ptr<DataType> value_type =
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 672d2d8..0b83b9f 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -141,9 +141,7 @@ class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder {
using value_type = typename Type::c_type;
explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type)
- : ArrayBuilder(pool, type), data_(nullptr) {}
-
- virtual ~PrimitiveBuilder() {}
+ : ArrayBuilder(pool, type), data_(nullptr), raw_data_(nullptr) {}
using ArrayBuilder::Advance;
@@ -233,6 +231,7 @@ using Int16Builder = NumericBuilder<Int16Type>;
using Int32Builder = NumericBuilder<Int32Type>;
using Int64Builder = NumericBuilder<Int64Type>;
using TimestampBuilder = NumericBuilder<TimestampType>;
+using TimeBuilder = NumericBuilder<TimeType>;
using DateBuilder = NumericBuilder<DateType>;
using HalfFloatBuilder = NumericBuilder<HalfFloatType>;
@@ -241,10 +240,8 @@ using DoubleBuilder = NumericBuilder<DoubleType>;
class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
public:
- explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type = boolean())
- : ArrayBuilder(pool, type), data_(nullptr) {}
-
- virtual ~BooleanBuilder() {}
+ explicit BooleanBuilder(MemoryPool* pool);
+ explicit BooleanBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type);
using ArrayBuilder::Advance;
@@ -321,8 +318,6 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder {
ListBuilder(
MemoryPool* pool, std::shared_ptr<Array> values, const TypePtr& type = nullptr);
- virtual ~ListBuilder() {}
-
Status Init(int32_t elements) override;
Status Resize(int32_t capacity) override;
Status Finish(std::shared_ptr<Array>* out) override;
@@ -368,8 +363,8 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder {
// BinaryBuilder : public ListBuilder
class ARROW_EXPORT BinaryBuilder : public ListBuilder {
public:
+ explicit BinaryBuilder(MemoryPool* pool);
explicit BinaryBuilder(MemoryPool* pool, const TypePtr& type);
- virtual ~BinaryBuilder() {}
Status Append(const uint8_t* value, int32_t length) {
RETURN_NOT_OK(ListBuilder::Append());
@@ -391,11 +386,7 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder {
// String builder
class ARROW_EXPORT StringBuilder : public BinaryBuilder {
public:
- explicit StringBuilder(MemoryPool* pool = default_memory_pool())
- : BinaryBuilder(pool, utf8()) {}
-
- explicit StringBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type)
- : BinaryBuilder(pool, type) {}
+ explicit StringBuilder(MemoryPool* pool);
using BinaryBuilder::Append;
http://git-wip-us.apache.org/repos/asf/arrow/blob/5439b715/cpp/src/arrow/column-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc
index 1e722ed..0bbfc83 100644
--- a/cpp/src/arrow/column-test.cc
+++ b/cpp/src/arrow/column-test.cc
@@ -51,7 +51,7 @@ TEST_F(TestChunkedArray, BasicEquals) {
std::vector<bool> null_bitmap(100, true);
std::vector<int32_t> data(100, 1);
std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array);
arrays_one_.push_back(array);
arrays_another_.push_back(array);
@@ -67,9 +67,9 @@ TEST_F(TestChunkedArray, EqualsDifferingTypes) {
std::vector<int32_t> data32(100, 1);
std::vector<int64_t> data64(100, 1);
std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data32, &array);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap, data32, &array);
arrays_one_.push_back(array);
- ArrayFromVector<Int64Type, int64_t>(int64(), null_bitmap, data64, &array);
+ ArrayFromVector<Int64Type, int64_t>(null_bitmap, data64, &array);
arrays_another_.push_back(array);
Construct();
@@ -83,9 +83,9 @@ TEST_F(TestChunkedArray, EqualsDifferingLengths) {
std::vector<int32_t> data100(100, 1);
std::vector<int32_t> data101(101, 1);
std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap100, data100, &array);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap100, data100, &array);
arrays_one_.push_back(array);
- ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap101, data101, &array);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap101, data101, &array);
arrays_another_.push_back(array);
Construct();
@@ -94,7 +94,7 @@ TEST_F(TestChunkedArray, EqualsDifferingLengths) {
std::vector<bool> null_bitmap1(1, true);
std::vector<int32_t> data1(1, 1);
- ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap1, data1, &array);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap1, data1, &array);
arrays_one_.push_back(array);
Construct();
@@ -156,7 +156,7 @@ TEST_F(TestColumn, Equals) {
std::vector<bool> null_bitmap(100, true);
std::vector<int32_t> data(100, 1);
std::shared_ptr<Array> array;
- ArrayFromVector<Int32Type, int32_t>(int32(), null_bitmap, data, &array);
+ ArrayFromVector<Int32Type, int32_t>(null_bitmap, data, &array);
arrays_one_.push_back(array);
arrays_another_.push_back(array);