You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by em...@apache.org on 2019/08/01 03:36:57 UTC
[arrow] branch master updated: ARROW-4810: [Format] [C++] Add LargeList type
This is an automated email from the ASF dual-hosted git repository.
emkornfield pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 8cdf567 ARROW-4810: [Format] [C++] Add LargeList type
8cdf567 is described below
commit 8cdf567480cfa4d1d6921fcc8b91f1c714225baa
Author: Antoine Pitrou <an...@python.org>
AuthorDate: Wed Jul 31 20:36:22 2019 -0700
ARROW-4810: [Format] [C++] Add LargeList type
Closes #4969 from pitrou/ARROW-4810-large-list and squashes the following commits:
a1d93b26b <Antoine Pitrou> ARROW-4810: Add LargeList type
Authored-by: Antoine Pitrou <an...@python.org>
Signed-off-by: Micah Kornfield <em...@gmail.com>
---
cpp/src/arrow/array-list-test.cc | 459 ++++++++++++++------------
cpp/src/arrow/array.cc | 144 +++++---
cpp/src/arrow/array.h | 103 ++++--
cpp/src/arrow/array/builder_nested.cc | 110 ------
cpp/src/arrow/array/builder_nested.h | 182 ++++++++--
cpp/src/arrow/array/concatenate-test.cc | 21 +-
cpp/src/arrow/array/concatenate.cc | 8 +
cpp/src/arrow/builder.cc | 8 +
cpp/src/arrow/compare.cc | 55 ++-
cpp/src/arrow/compute/kernels/cast-test.cc | 28 ++
cpp/src/arrow/compute/kernels/cast.cc | 19 +-
cpp/src/arrow/compute/kernels/filter-test.cc | 9 +
cpp/src/arrow/compute/kernels/take-internal.h | 47 +--
cpp/src/arrow/compute/kernels/take-test.cc | 9 +
cpp/src/arrow/extension_type.cc | 2 -
cpp/src/arrow/extension_type.h | 5 +-
cpp/src/arrow/ipc/json-internal.cc | 47 ++-
cpp/src/arrow/ipc/json-simple-test.cc | 15 +
cpp/src/arrow/ipc/json-simple.cc | 17 +-
cpp/src/arrow/ipc/json-test.cc | 23 +-
cpp/src/arrow/ipc/metadata-internal.cc | 13 +
cpp/src/arrow/ipc/reader.cc | 31 +-
cpp/src/arrow/ipc/test-common.cc | 51 ++-
cpp/src/arrow/ipc/test-common.h | 5 +
cpp/src/arrow/ipc/writer.cc | 13 +-
cpp/src/arrow/pretty_print-test.cc | 13 +-
cpp/src/arrow/pretty_print.cc | 2 +
cpp/src/arrow/scalar.cc | 8 +-
cpp/src/arrow/scalar.h | 16 +-
cpp/src/arrow/type-test.cc | 35 ++
cpp/src/arrow/type.cc | 14 +
cpp/src/arrow/type.h | 125 ++++++-
cpp/src/arrow/type_fwd.h | 5 +
cpp/src/arrow/type_traits.h | 22 ++
cpp/src/arrow/visitor.cc | 3 +
cpp/src/arrow/visitor.h | 3 +
cpp/src/arrow/visitor_inline.h | 1 +
cpp/src/parquet/arrow/writer.cc | 3 +
format/Schema.fbs | 6 +
39 files changed, 1123 insertions(+), 557 deletions(-)
diff --git a/cpp/src/arrow/array-list-test.cc b/cpp/src/arrow/array-list-test.cc
index c3118e9..9adaf04 100644
--- a/cpp/src/arrow/array-list-test.cc
+++ b/cpp/src/arrow/array-list-test.cc
@@ -37,308 +37,345 @@ namespace arrow {
using internal::checked_cast;
using internal::checked_pointer_cast;
+using ListTypes = ::testing::Types<ListType, LargeListType>;
+
// ----------------------------------------------------------------------
// List tests
+template <typename T>
class TestListArray : public TestBuilder {
public:
+ using TypeClass = T;
+ using offset_type = typename TypeClass::offset_type;
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
+ using BuilderType = typename TypeTraits<TypeClass>::BuilderType;
+ using OffsetType = typename TypeTraits<TypeClass>::OffsetType;
+ using OffsetArrayType = typename TypeTraits<TypeClass>::OffsetArrayType;
+ using OffsetBuilderType = typename TypeTraits<TypeClass>::OffsetBuilderType;
+
void SetUp() {
TestBuilder::SetUp();
- value_type_ = int32();
- type_ = list(value_type_);
+ value_type_ = int16();
+ type_ = std::make_shared<T>(value_type_);
std::unique_ptr<ArrayBuilder> tmp;
ASSERT_OK(MakeBuilder(pool_, type_, &tmp));
- builder_.reset(checked_cast<ListBuilder*>(tmp.release()));
+ builder_.reset(checked_cast<BuilderType*>(tmp.release()));
}
void Done() {
std::shared_ptr<Array> out;
FinishAndCheckPadding(builder_.get(), &out);
- result_ = std::dynamic_pointer_cast<ListArray>(out);
+ result_ = std::dynamic_pointer_cast<ArrayType>(out);
}
- protected:
- std::shared_ptr<DataType> value_type_;
+ void ValidateBasicListArray(const ArrayType* result, const std::vector<int16_t>& values,
+ const std::vector<uint8_t>& is_valid) {
+ ASSERT_OK(ValidateArray(*result));
+ ASSERT_EQ(1, result->null_count());
+ ASSERT_EQ(0, result->values()->null_count());
- std::shared_ptr<ListBuilder> builder_;
- std::shared_ptr<ListArray> result_;
-};
+ ASSERT_EQ(3, result->length());
+ std::vector<offset_type> ex_offsets = {0, 3, 3, 7};
+ for (size_t i = 0; i < ex_offsets.size(); ++i) {
+ ASSERT_EQ(ex_offsets[i], result->value_offset(i));
+ }
-TEST_F(TestListArray, Equality) {
- Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());
+ for (int i = 0; i < result->length(); ++i) {
+ ASSERT_EQ(is_valid[i] == 0, result->IsNull(i));
+ }
- std::shared_ptr<Array> array, equal_array, unequal_array;
- std::vector<int32_t> equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10};
- std::vector<int32_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6};
- std::vector<int32_t> unequal_offsets = {0, 1, 4, 7};
- std::vector<int32_t> unequal_values = {1, 2, 2, 2, 3, 4, 5};
+ ASSERT_EQ(7, result->values()->length());
+ auto varr = std::dynamic_pointer_cast<Int16Array>(result->values());
- // setup two equal arrays
- ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size()));
- ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size()));
+ for (size_t i = 0; i < values.size(); ++i) {
+ ASSERT_EQ(values[i], varr->Value(i));
+ }
+ }
- ASSERT_OK(builder_->Finish(&array));
- ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size()));
- ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size()));
+ void TestBasics() {
+ std::vector<int16_t> values = {0, 1, 2, 3, 4, 5, 6};
+ std::vector<int> lengths = {3, 0, 4};
+ std::vector<uint8_t> is_valid = {1, 0, 1};
- ASSERT_OK(builder_->Finish(&equal_array));
- // now an unequal one
- ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size()));
- ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size()));
+ Int16Builder* vb = checked_cast<Int16Builder*>(builder_->value_builder());
- ASSERT_OK(builder_->Finish(&unequal_array));
+ ASSERT_OK(builder_->Reserve(lengths.size()));
+ ASSERT_OK(vb->Reserve(values.size()));
- // Test array equality
- EXPECT_TRUE(array->Equals(array));
- EXPECT_TRUE(array->Equals(equal_array));
- EXPECT_TRUE(equal_array->Equals(array));
- EXPECT_FALSE(equal_array->Equals(unequal_array));
- EXPECT_FALSE(unequal_array->Equals(equal_array));
+ int pos = 0;
+ for (size_t i = 0; i < lengths.size(); ++i) {
+ ASSERT_OK(builder_->Append(is_valid[i] > 0));
+ for (int j = 0; j < lengths[i]; ++j) {
+ ASSERT_OK(vb->Append(values[pos++]));
+ }
+ }
- // Test range equality
- EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array));
- EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array));
- EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
- EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array));
+ Done();
+ ValidateBasicListArray(result_.get(), values, is_valid);
+ }
- // Check with slices, ARROW-33
- std::shared_ptr<Array> slice, slice2;
+ void TestEquality() {
+ auto vb = checked_cast<Int16Builder*>(builder_->value_builder());
- slice = array->Slice(2);
- slice2 = array->Slice(2);
- ASSERT_EQ(array->length() - 2, slice->length());
+ std::shared_ptr<Array> array, equal_array, unequal_array;
+ std::vector<offset_type> equal_offsets = {0, 1, 2, 5, 6, 7, 8, 10};
+ std::vector<int16_t> equal_values = {1, 2, 3, 4, 5, 2, 2, 2, 5, 6};
+ std::vector<offset_type> unequal_offsets = {0, 1, 4, 7};
+ std::vector<int16_t> unequal_values = {1, 2, 2, 2, 3, 4, 5};
- ASSERT_TRUE(slice->Equals(slice2));
- ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice));
+ // setup two equal arrays
+ ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size()));
+ ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size()));
- // Chained slices
- slice2 = array->Slice(1)->Slice(1);
- ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_OK(builder_->Finish(&array));
+ ASSERT_OK(builder_->AppendValues(equal_offsets.data(), equal_offsets.size()));
+ ASSERT_OK(vb->AppendValues(equal_values.data(), equal_values.size()));
- slice = array->Slice(1, 4);
- slice2 = array->Slice(1, 4);
- ASSERT_EQ(4, slice->length());
+ ASSERT_OK(builder_->Finish(&equal_array));
+ // now an unequal one
+ ASSERT_OK(builder_->AppendValues(unequal_offsets.data(), unequal_offsets.size()));
+ ASSERT_OK(vb->AppendValues(unequal_values.data(), unequal_values.size()));
- ASSERT_TRUE(slice->Equals(slice2));
- ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice));
-}
+ ASSERT_OK(builder_->Finish(&unequal_array));
-TEST_F(TestListArray, ValuesEquality) {
- auto type = list(int32());
- auto left = ArrayFromJSON(type, "[[1, 2], [3], [0]]");
- auto right = ArrayFromJSON(type, "[[1, 2], [3], [100000]]");
- auto offset = 2;
- EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset)));
-}
+ // Test array equality
+ EXPECT_TRUE(array->Equals(array));
+ EXPECT_TRUE(array->Equals(equal_array));
+ EXPECT_TRUE(equal_array->Equals(array));
+ EXPECT_FALSE(equal_array->Equals(unequal_array));
+ EXPECT_FALSE(unequal_array->Equals(equal_array));
-TEST_F(TestListArray, TestResize) {}
+ // Test range equality
+ EXPECT_TRUE(array->RangeEquals(0, 1, 0, unequal_array));
+ EXPECT_FALSE(array->RangeEquals(0, 2, 0, unequal_array));
+ EXPECT_FALSE(array->RangeEquals(1, 2, 1, unequal_array));
+ EXPECT_TRUE(array->RangeEquals(2, 3, 2, unequal_array));
-TEST_F(TestListArray, TestFromArrays) {
- std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, values;
+ // Check with slices, ARROW-33
+ std::shared_ptr<Array> slice, slice2;
- std::vector<bool> offsets_is_valid3 = {true, false, true, true};
- std::vector<bool> offsets_is_valid4 = {true, true, false, true};
+ slice = array->Slice(2);
+ slice2 = array->Slice(2);
+ ASSERT_EQ(array->length() - 2, slice->length());
- std::vector<bool> values_is_valid = {true, false, true, true, true, true};
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(2, slice->length(), 0, slice));
- std::vector<int32_t> offset1_values = {0, 2, 2, 6};
- std::vector<int32_t> offset2_values = {0, 2, 6, 6};
+ // Chained slices
+ slice2 = array->Slice(1)->Slice(1);
+ ASSERT_TRUE(slice->Equals(slice2));
- std::vector<int8_t> values_values = {0, 1, 2, 3, 4, 5};
- const int length = 3;
+ slice = array->Slice(1, 4);
+ slice2 = array->Slice(1, 4);
+ ASSERT_EQ(4, slice->length());
- ArrayFromVector<Int32Type, int32_t>(offset1_values, &offsets1);
- ArrayFromVector<Int32Type, int32_t>(offset2_values, &offsets2);
+ ASSERT_TRUE(slice->Equals(slice2));
+ ASSERT_TRUE(array->RangeEquals(1, 5, 0, slice));
+ }
- ArrayFromVector<Int32Type, int32_t>(offsets_is_valid3, offset1_values, &offsets3);
- ArrayFromVector<Int32Type, int32_t>(offsets_is_valid4, offset2_values, &offsets4);
+ void TestValuesEquality() {
+ auto type = std::make_shared<T>(int32());
+ auto left = ArrayFromJSON(type, "[[1, 2], [3], [0]]");
+ auto right = ArrayFromJSON(type, "[[1, 2], [3], [100000]]");
+ auto offset = 2;
+ EXPECT_FALSE(left->Slice(offset)->Equals(right->Slice(offset)));
+ }
- ArrayFromVector<Int8Type, int8_t>(values_is_valid, values_values, &values);
+ void TestFromArrays() {
+ std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, values;
- auto list_type = list(int8());
+ std::vector<bool> offsets_is_valid3 = {true, false, true, true};
+ std::vector<bool> offsets_is_valid4 = {true, true, false, true};
- std::shared_ptr<Array> list1, list3, list4;
- ASSERT_OK(ListArray::FromArrays(*offsets1, *values, pool_, &list1));
- ASSERT_OK(ListArray::FromArrays(*offsets3, *values, pool_, &list3));
- ASSERT_OK(ListArray::FromArrays(*offsets4, *values, pool_, &list4));
+ std::vector<bool> values_is_valid = {true, false, true, true, true, true};
- ListArray expected1(list_type, length, offsets1->data()->buffers[1], values,
- offsets1->data()->buffers[0], 0);
- AssertArraysEqual(expected1, *list1);
+ std::vector<offset_type> offset1_values = {0, 2, 2, 6};
+ std::vector<offset_type> offset2_values = {0, 2, 6, 6};
- // Use null bitmap from offsets3, but clean offsets from non-null version
- ListArray expected3(list_type, length, offsets1->data()->buffers[1], values,
- offsets3->data()->buffers[0], 1);
- AssertArraysEqual(expected3, *list3);
+ std::vector<int8_t> values_values = {0, 1, 2, 3, 4, 5};
+ const int length = 3;
- // Check that the last offset bit is zero
- ASSERT_FALSE(BitUtil::GetBit(list3->null_bitmap()->data(), length + 1));
+ ArrayFromVector<OffsetType, offset_type>(offset1_values, &offsets1);
+ ArrayFromVector<OffsetType, offset_type>(offset2_values, &offsets2);
- ListArray expected4(list_type, length, offsets2->data()->buffers[1], values,
- offsets4->data()->buffers[0], 1);
- AssertArraysEqual(expected4, *list4);
+ ArrayFromVector<OffsetType, offset_type>(offsets_is_valid3, offset1_values,
+ &offsets3);
+ ArrayFromVector<OffsetType, offset_type>(offsets_is_valid4, offset2_values,
+ &offsets4);
- // Test failure modes
+ ArrayFromVector<Int8Type, int8_t>(values_is_valid, values_values, &values);
- std::shared_ptr<Array> tmp;
+ auto list_type = std::make_shared<T>(int8());
- // Zero-length offsets
- ASSERT_RAISES(Invalid,
- ListArray::FromArrays(*offsets1->Slice(0, 0), *values, pool_, &tmp));
+ std::shared_ptr<Array> list1, list3, list4;
+ ASSERT_OK(ArrayType::FromArrays(*offsets1, *values, pool_, &list1));
+ ASSERT_OK(ArrayType::FromArrays(*offsets3, *values, pool_, &list3));
+ ASSERT_OK(ArrayType::FromArrays(*offsets4, *values, pool_, &list4));
+ ASSERT_OK(ValidateArray(*list1));
+ ASSERT_OK(ValidateArray(*list3));
+ ASSERT_OK(ValidateArray(*list4));
- // Offsets not int32
- ASSERT_RAISES(TypeError, ListArray::FromArrays(*values, *offsets1, pool_, &tmp));
-}
+ ArrayType expected1(list_type, length, offsets1->data()->buffers[1], values,
+ offsets1->data()->buffers[0], 0);
+ AssertArraysEqual(expected1, *list1);
-TEST_F(TestListArray, TestAppendNull) {
- ASSERT_OK(builder_->AppendNull());
- ASSERT_OK(builder_->AppendNull());
+ // Use null bitmap from offsets3, but clean offsets from non-null version
+ ArrayType expected3(list_type, length, offsets1->data()->buffers[1], values,
+ offsets3->data()->buffers[0], 1);
+ AssertArraysEqual(expected3, *list3);
- Done();
+ // Check that the last offset bit is zero
+ ASSERT_FALSE(BitUtil::GetBit(list3->null_bitmap()->data(), length + 1));
- ASSERT_OK(ValidateArray(*result_));
- ASSERT_TRUE(result_->IsNull(0));
- ASSERT_TRUE(result_->IsNull(1));
+ ArrayType expected4(list_type, length, offsets2->data()->buffers[1], values,
+ offsets4->data()->buffers[0], 1);
+ AssertArraysEqual(expected4, *list4);
- ASSERT_EQ(0, result_->raw_value_offsets()[0]);
- ASSERT_EQ(0, result_->value_offset(1));
- ASSERT_EQ(0, result_->value_offset(2));
+ // Test failure modes
- auto values = result_->values();
- ASSERT_EQ(0, values->length());
- // Values buffer should be non-null
- ASSERT_NE(nullptr, values->data()->buffers[1]);
-}
+ std::shared_ptr<Array> tmp;
-TEST_F(TestListArray, TestAppendNulls) {
- ASSERT_OK(builder_->AppendNulls(3));
+ // Zero-length offsets
+ ASSERT_RAISES(Invalid,
+ ArrayType::FromArrays(*offsets1->Slice(0, 0), *values, pool_, &tmp));
- Done();
+ // Offsets not the right type
+ ASSERT_RAISES(TypeError, ArrayType::FromArrays(*values, *offsets1, pool_, &tmp));
+ }
- ASSERT_OK(ValidateArray(*result_));
- ASSERT_EQ(result_->length(), 3);
- ASSERT_EQ(result_->null_count(), 3);
- ASSERT_TRUE(result_->IsNull(0));
- ASSERT_TRUE(result_->IsNull(1));
- ASSERT_TRUE(result_->IsNull(2));
+ void TestAppendNull() {
+ ASSERT_OK(builder_->AppendNull());
+ ASSERT_OK(builder_->AppendNull());
- ASSERT_EQ(0, result_->raw_value_offsets()[0]);
- ASSERT_EQ(0, result_->value_offset(1));
- ASSERT_EQ(0, result_->value_offset(2));
- ASSERT_EQ(0, result_->value_offset(3));
+ Done();
- auto values = result_->values();
- ASSERT_EQ(0, values->length());
- // Values buffer should be non-null
- ASSERT_NE(nullptr, values->data()->buffers[1]);
-}
+ ASSERT_OK(ValidateArray(*result_));
+ ASSERT_TRUE(result_->IsNull(0));
+ ASSERT_TRUE(result_->IsNull(1));
-void ValidateBasicListArray(const ListArray* result, const std::vector<int32_t>& values,
- const std::vector<uint8_t>& is_valid) {
- ASSERT_OK(ValidateArray(*result));
- ASSERT_EQ(1, result->null_count());
- ASSERT_EQ(0, result->values()->null_count());
+ ASSERT_EQ(0, result_->raw_value_offsets()[0]);
+ ASSERT_EQ(0, result_->value_offset(1));
+ ASSERT_EQ(0, result_->value_offset(2));
- ASSERT_EQ(3, result->length());
- std::vector<int32_t> ex_offsets = {0, 3, 3, 7};
- for (size_t i = 0; i < ex_offsets.size(); ++i) {
- ASSERT_EQ(ex_offsets[i], result->value_offset(i));
+ auto values = result_->values();
+ ASSERT_EQ(0, values->length());
+ // Values buffer should be non-null
+ ASSERT_NE(nullptr, values->data()->buffers[1]);
}
- for (int i = 0; i < result->length(); ++i) {
- ASSERT_EQ(is_valid[i] == 0, result->IsNull(i));
- }
+ void TestAppendNulls() {
+ ASSERT_OK(builder_->AppendNulls(3));
- ASSERT_EQ(7, result->values()->length());
- auto varr = std::dynamic_pointer_cast<Int32Array>(result->values());
+ Done();
- for (size_t i = 0; i < values.size(); ++i) {
- ASSERT_EQ(values[i], varr->Value(i));
- }
-}
+ ASSERT_OK(ValidateArray(*result_));
+ ASSERT_EQ(result_->length(), 3);
+ ASSERT_EQ(result_->null_count(), 3);
+ ASSERT_TRUE(result_->IsNull(0));
+ ASSERT_TRUE(result_->IsNull(1));
+ ASSERT_TRUE(result_->IsNull(2));
-TEST_F(TestListArray, TestBasics) {
- std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
- std::vector<int> lengths = {3, 0, 4};
- std::vector<uint8_t> is_valid = {1, 0, 1};
+ ASSERT_EQ(0, result_->raw_value_offsets()[0]);
+ ASSERT_EQ(0, result_->value_offset(1));
+ ASSERT_EQ(0, result_->value_offset(2));
+ ASSERT_EQ(0, result_->value_offset(3));
- Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());
+ auto values = result_->values();
+ ASSERT_EQ(0, values->length());
+ // Values buffer should be non-null
+ ASSERT_NE(nullptr, values->data()->buffers[1]);
+ }
- ASSERT_OK(builder_->Reserve(lengths.size()));
- ASSERT_OK(vb->Reserve(values.size()));
+ void TestBulkAppend() {
+ std::vector<int16_t> values = {0, 1, 2, 3, 4, 5, 6};
+ std::vector<uint8_t> is_valid = {1, 0, 1};
+ std::vector<offset_type> offsets = {0, 3, 3};
- int pos = 0;
- for (size_t i = 0; i < lengths.size(); ++i) {
- ASSERT_OK(builder_->Append(is_valid[i] > 0));
- for (int j = 0; j < lengths[i]; ++j) {
- ASSERT_OK(vb->Append(values[pos++]));
+ Int16Builder* vb = checked_cast<Int16Builder*>(builder_->value_builder());
+ ASSERT_OK(vb->Reserve(values.size()));
+
+ ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data()));
+ for (int16_t value : values) {
+ ASSERT_OK(vb->Append(value));
}
+ Done();
+ ValidateBasicListArray(result_.get(), values, is_valid);
}
- Done();
- ValidateBasicListArray(result_.get(), values, is_valid);
-}
+ void TestBulkAppendInvalid() {
+ std::vector<int16_t> values = {0, 1, 2, 3, 4, 5, 6};
+ std::vector<int> lengths = {3, 0, 4};
+ std::vector<uint8_t> is_valid = {1, 0, 1};
+ // Should be {0, 3, 3} given the is_valid array
+ std::vector<offset_type> offsets = {0, 2, 4};
-TEST_F(TestListArray, BulkAppend) {
- std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
- std::vector<int> lengths = {3, 0, 4};
- std::vector<uint8_t> is_valid = {1, 0, 1};
- std::vector<int32_t> offsets = {0, 3, 3};
+ Int16Builder* vb = checked_cast<Int16Builder*>(builder_->value_builder());
+ ASSERT_OK(vb->Reserve(values.size()));
- Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());
- ASSERT_OK(vb->Reserve(values.size()));
+ ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data()));
+ ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data()));
+ for (int16_t value : values) {
+ ASSERT_OK(vb->Append(value));
+ }
- ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data()));
- for (int32_t value : values) {
- ASSERT_OK(vb->Append(value));
+ Done();
+ ASSERT_RAISES(Invalid, ValidateArray(*result_));
}
- Done();
- ValidateBasicListArray(result_.get(), values, is_valid);
-}
-TEST_F(TestListArray, BulkAppendInvalid) {
- std::vector<int32_t> values = {0, 1, 2, 3, 4, 5, 6};
- std::vector<int> lengths = {3, 0, 4};
- std::vector<uint8_t> is_null = {0, 1, 0};
- std::vector<uint8_t> is_valid = {1, 0, 1};
- std::vector<int32_t> offsets = {0, 2, 4}; // should be 0, 3, 3 given the is_null array
+ void TestZeroLength() {
+ // All buffers are null
+ Done();
+ ASSERT_OK(ValidateArray(*result_));
+ }
- Int32Builder* vb = checked_cast<Int32Builder*>(builder_->value_builder());
- ASSERT_OK(vb->Reserve(values.size()));
+ void TestBuilderPreserveFieldName() {
+ auto list_type_with_name = std::make_shared<T>(field("counts", int16()));
- ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data()));
- ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size(), is_valid.data()));
- for (int32_t value : values) {
- ASSERT_OK(vb->Append(value));
+ std::unique_ptr<ArrayBuilder> tmp;
+ ASSERT_OK(MakeBuilder(pool_, list_type_with_name, &tmp));
+ builder_.reset(checked_cast<BuilderType*>(tmp.release()));
+
+ std::vector<offset_type> offsets = {1, 2, 4, 8};
+ ASSERT_OK(builder_->AppendValues(offsets.data(), offsets.size()));
+
+ std::shared_ptr<Array> list_array;
+ ASSERT_OK(builder_->Finish(&list_array));
+
+ const auto& type = checked_cast<T&>(*list_array->type());
+ ASSERT_EQ("counts", type.value_field()->name());
}
- Done();
- ASSERT_RAISES(Invalid, ValidateArray(*result_));
-}
+ protected:
+ std::shared_ptr<DataType> value_type_;
-TEST_F(TestListArray, TestZeroLength) {
- // All buffers are null
- Done();
- ASSERT_OK(ValidateArray(*result_));
-}
+ std::shared_ptr<BuilderType> builder_;
+ std::shared_ptr<ArrayType> result_;
+};
-TEST_F(TestListArray, TestBuilderPreserveFieleName) {
- auto list_type_with_name = list(field("counts", int32()));
+TYPED_TEST_CASE(TestListArray, ListTypes);
- std::unique_ptr<ArrayBuilder> tmp;
- ASSERT_OK(MakeBuilder(pool_, list_type_with_name, &tmp));
- builder_.reset(checked_cast<ListBuilder*>(tmp.release()));
+TYPED_TEST(TestListArray, Basics) { this->TestBasics(); }
- std::vector<int32_t> values = {1, 2, 4, 8};
- ASSERT_OK(builder_->AppendValues(values.data(), values.size()));
+TYPED_TEST(TestListArray, Equality) { this->TestEquality(); }
- std::shared_ptr<Array> list_array;
- ASSERT_OK(builder_->Finish(&list_array));
+TYPED_TEST(TestListArray, ValuesEquality) { this->TestValuesEquality(); }
- const auto& type = checked_cast<ListType&>(*list_array->type());
- ASSERT_EQ("counts", type.value_field()->name());
+TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); }
+
+TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); }
+
+TYPED_TEST(TestListArray, AppendNulls) { this->TestAppendNulls(); }
+
+TYPED_TEST(TestListArray, BulkAppend) { this->TestBulkAppend(); }
+
+TYPED_TEST(TestListArray, BulkAppendInvalid) { this->TestBulkAppendInvalid(); }
+
+TYPED_TEST(TestListArray, ZeroLength) { this->TestZeroLength(); }
+
+TYPED_TEST(TestListArray, BuilderPreserveFieldName) {
+ this->TestBuilderPreserveFieldName();
}
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc
index 0b7d8f1..01f0ddb 100644
--- a/cpp/src/arrow/array.cc
+++ b/cpp/src/arrow/array.cc
@@ -22,6 +22,7 @@
#include <cstdint>
#include <limits>
#include <sstream>
+#include <type_traits>
#include <utility>
#include "arrow/buffer.h"
@@ -199,34 +200,29 @@ BooleanArray::BooleanArray(int64_t length, const std::shared_ptr<Buffer>& data,
: PrimitiveArray(boolean(), length, data, null_bitmap, null_count, offset) {}
// ----------------------------------------------------------------------
-// ListArray
+// ListArray / LargeListArray
-ListArray::ListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+namespace {
-ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
- const std::shared_ptr<Buffer>& value_offsets,
- const std::shared_ptr<Array>& values,
- const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
- int64_t offset) {
- auto internal_data =
- ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
- internal_data->child_data.emplace_back(values->data());
- SetData(internal_data);
-}
+template <typename TYPE>
+Status ListArrayFromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ using offset_type = typename TYPE::offset_type;
+ using ArrayType = typename TypeTraits<TYPE>::ArrayType;
+ using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
+ using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
-Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
- std::shared_ptr<Array>* out) {
if (offsets.length() == 0) {
return Status::Invalid("List offsets must have non-zero length");
}
- if (offsets.type_id() != Type::INT32) {
- return Status::TypeError("List offsets must be signed int32");
+ if (offsets.type_id() != OffsetArrowType::type_id) {
+ return Status::TypeError("List offsets must be ", OffsetArrowType::type_name());
}
BufferVector buffers = {};
- const auto& typed_offsets = checked_cast<const Int32Array&>(offsets);
+ const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
const int64_t num_offsets = offsets.length();
@@ -236,7 +232,8 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
}
std::shared_ptr<Buffer> clean_offsets, clean_valid_bits;
- RETURN_NOT_OK(AllocateBuffer(pool, num_offsets * sizeof(int32_t), &clean_offsets));
+ RETURN_NOT_OK(
+ AllocateBuffer(pool, num_offsets * sizeof(offset_type), &clean_offsets));
// Copy valid bits, zero out the bit for the final offset
// XXX why?
@@ -245,11 +242,12 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
BitUtil::ClearBit(clean_valid_bits->mutable_data(), num_offsets);
buffers.emplace_back(std::move(clean_valid_bits));
- const int32_t* raw_offsets = typed_offsets.raw_values();
- auto clean_raw_offsets = reinterpret_cast<int32_t*>(clean_offsets->mutable_data());
+ const offset_type* raw_offsets = typed_offsets.raw_values();
+ auto clean_raw_offsets =
+ reinterpret_cast<offset_type*>(clean_offsets->mutable_data());
// Must work backwards so we can tell how many values were in the last non-null value
- int32_t current_offset = raw_offsets[num_offsets - 1];
+ offset_type current_offset = raw_offsets[num_offsets - 1];
for (int64_t i = num_offsets - 1; i >= 0; --i) {
if (offsets.IsValid(i)) {
current_offset = raw_offsets[i];
@@ -263,25 +261,55 @@ Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPo
buffers.emplace_back(typed_offsets.values());
}
- auto list_type = list(values.type());
+ auto list_type = std::make_shared<TYPE>(values.type());
auto internal_data = ArrayData::Make(list_type, num_offsets - 1, std::move(buffers),
offsets.null_count(), offsets.offset());
internal_data->child_data.push_back(values.data());
- *out = std::make_shared<ListArray>(internal_data);
+ *out = std::make_shared<ArrayType>(internal_data);
return Status::OK();
}
+} // namespace
+
+ListArray::ListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+
+LargeListArray::LargeListArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
+
+ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
+ int64_t offset) {
+ ARROW_CHECK_EQ(type->id(), Type::LIST);
+ auto internal_data =
+ ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
+ internal_data->child_data.emplace_back(values->data());
+ SetData(internal_data);
+}
+
+LargeListArray::LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap,
+ int64_t null_count, int64_t offset) {
+ ARROW_CHECK_EQ(type->id(), Type::LARGE_LIST);
+ auto internal_data =
+ ArrayData::Make(type, length, {null_bitmap, value_offsets}, null_count, offset);
+ internal_data->child_data.emplace_back(values->data());
+ SetData(internal_data);
+}
+
void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
ARROW_CHECK_EQ(data->buffers.size(), 2);
- ARROW_CHECK(data->type->id() == Type::LIST);
+ ARROW_CHECK_EQ(data->type->id(), Type::LIST);
list_type_ = checked_cast<const ListType*>(data->type.get());
auto value_offsets = data->buffers[1];
raw_value_offsets_ = value_offsets == nullptr
? nullptr
- : reinterpret_cast<const int32_t*>(value_offsets->data());
+ : reinterpret_cast<const offset_type*>(value_offsets->data());
ARROW_CHECK_EQ(data_->child_data.size(), 1);
ARROW_CHECK_EQ(list_type_->value_type()->id(), data->child_data[0]->type->id());
@@ -289,11 +317,32 @@ void ListArray::SetData(const std::shared_ptr<ArrayData>& data) {
values_ = MakeArray(data_->child_data[0]);
}
-std::shared_ptr<DataType> ListArray::value_type() const {
- return list_type()->value_type();
+void LargeListArray::SetData(const std::shared_ptr<ArrayData>& data) {
+ this->Array::SetData(data);
+ ARROW_CHECK_EQ(data->buffers.size(), 2);
+ ARROW_CHECK_EQ(data->type->id(), Type::LARGE_LIST);
+ list_type_ = checked_cast<const LargeListType*>(data->type.get());
+
+ auto value_offsets = data->buffers[1];
+ raw_value_offsets_ = value_offsets == nullptr
+ ? nullptr
+ : reinterpret_cast<const offset_type*>(value_offsets->data());
+
+ ARROW_CHECK_EQ(data_->child_data.size(), 1);
+ ARROW_CHECK_EQ(list_type_->value_type()->id(), data->child_data[0]->type->id());
+ DCHECK(list_type_->value_type()->Equals(data->child_data[0]->type));
+ values_ = MakeArray(data_->child_data[0]);
}
-std::shared_ptr<Array> ListArray::values() const { return values_; }
+Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ return ListArrayFromArrays<ListType>(offsets, values, pool, out);
+}
+
+Status LargeListArray::FromArrays(const Array& offsets, const Array& values,
+ MemoryPool* pool, std::shared_ptr<Array>* out) {
+ return ListArrayFromArrays<LargeListType>(offsets, values, pool, out);
+}
// ----------------------------------------------------------------------
// MapArray
@@ -1167,21 +1216,12 @@ struct ValidateVisitor {
}
Status Visit(const ListArray& array) {
- if (!array.values()) {
- return Status::Invalid("values was null");
- }
-
- const int32_t last_offset = array.value_offset(array.length());
- if (array.values()->length() != last_offset) {
- return Status::Invalid("Final offset invariant not equal to values length: ",
- last_offset, "!=", array.values()->length());
- }
-
- const Status child_valid = ValidateArray(*array.values());
- if (!child_valid.ok()) {
- return Status::Invalid("Child array invalid: ", child_valid.ToString());
- }
+ RETURN_NOT_OK(ValidateListArray(array));
+ return ValidateOffsets(array);
+ }
+ Status Visit(const LargeListArray& array) {
+ RETURN_NOT_OK(ValidateListArray(array));
return ValidateOffsets(array);
}
@@ -1280,6 +1320,26 @@ struct ValidateVisitor {
}
protected:
+ template <typename ListArrayType>
+ Status ValidateListArray(const ListArrayType& array) {
+ if (!array.values()) {
+ return Status::Invalid("values was null");
+ }
+
+ const auto last_offset = array.value_offset(array.length());
+ if (array.values()->length() != last_offset) {
+ return Status::Invalid("Final offset invariant not equal to values length: ",
+ last_offset, "!=", array.values()->length());
+ }
+
+ const Status child_valid = ValidateArray(*array.values());
+ if (!child_valid.ok()) {
+ return Status::Invalid("Child array invalid: ", child_valid.ToString());
+ }
+
+ return ValidateOffsets(array);
+ }
+
template <typename ArrayType>
Status ValidateOffsets(ArrayType& array) {
using offset_type = typename ArrayType::offset_type;
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index e13088c..2313994 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -488,12 +488,49 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
// ----------------------------------------------------------------------
// ListArray
-/// Concrete Array class for list data
-class ARROW_EXPORT ListArray : public Array {
+/// Base class for variable-sized list arrays, regardless of offset size.
+template <typename TYPE>
+class BaseListArray : public Array {
public:
- using TypeClass = ListType;
- using offset_type = ListType::offset_type;
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
+
+ const TypeClass* list_type() const { return list_type_; }
+
+ /// \brief Return array object containing the list's values
+ std::shared_ptr<Array> values() const { return values_; }
+
+ /// Note that this buffer does not account for any slice offset
+ std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
+ std::shared_ptr<DataType> value_type() const { return list_type_->value_type(); }
+
+ /// Return pointer to raw value offsets accounting for any slice offset
+ const offset_type* raw_value_offsets() const {
+ return raw_value_offsets_ + data_->offset;
+ }
+
+ // The following functions will not perform bounds checking
+ offset_type value_offset(int64_t i) const {
+ return raw_value_offsets_[i + data_->offset];
+ }
+ offset_type value_length(int64_t i) const {
+ i += data_->offset;
+ return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
+ }
+ std::shared_ptr<Array> value_slice(int64_t i) const {
+ return values_->Slice(value_offset(i), value_length(i));
+ }
+
+ protected:
+ const TypeClass* list_type_ = NULLPTR;
+ std::shared_ptr<Array> values_;
+ const offset_type* raw_value_offsets_ = NULLPTR;
+};
+
+/// Concrete Array class for list data
+class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
+ public:
explicit ListArray(const std::shared_ptr<ArrayData>& data);
ListArray(const std::shared_ptr<DataType>& type, int64_t length,
@@ -511,46 +548,48 @@ class ARROW_EXPORT ListArray : public Array {
///
/// \param[in] offsets Array containing n + 1 offsets encoding length and
/// size. Must be of int32 type
- /// \param[in] values Array containing
+ /// \param[in] values Array containing list values
/// \param[in] pool MemoryPool in case new offsets array needs to be
/// allocated because of null values
/// \param[out] out Will have length equal to offsets.length() - 1
static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
std::shared_ptr<Array>* out);
- const ListType* list_type() const { return list_type_; }
-
- /// \brief Return array object containing the list's values
- std::shared_ptr<Array> values() const;
-
- /// Note that this buffer does not account for any slice offset
- std::shared_ptr<Buffer> value_offsets() const { return data_->buffers[1]; }
-
- std::shared_ptr<DataType> value_type() const;
-
- /// Return pointer to raw value offsets accounting for any slice offset
- const int32_t* raw_value_offsets() const { return raw_value_offsets_ + data_->offset; }
-
- // The following functions will not perform boundschecking
- int32_t value_offset(int64_t i) const { return raw_value_offsets_[i + data_->offset]; }
- int32_t value_length(int64_t i) const {
- i += data_->offset;
- return raw_value_offsets_[i + 1] - raw_value_offsets_[i];
- }
- std::shared_ptr<Array> value_slice(int64_t i) const {
- return values_->Slice(value_offset(i), value_length(i));
- }
-
protected:
// This constructor defers SetData to a derived array class
ListArray() = default;
void SetData(const std::shared_ptr<ArrayData>& data);
+};
- const int32_t* raw_value_offsets_ = NULLPTR;
+/// Concrete Array class for large list data (with 64-bit offsets)
+class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
+ public:
+ explicit LargeListArray(const std::shared_ptr<ArrayData>& data);
- private:
- const ListType* list_type_ = NULLPTR;
- std::shared_ptr<Array> values_;
+ LargeListArray(const std::shared_ptr<DataType>& type, int64_t length,
+ const std::shared_ptr<Buffer>& value_offsets,
+ const std::shared_ptr<Array>& values,
+ const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
+ int64_t null_count = kUnknownNullCount, int64_t offset = 0);
+
+ /// \brief Construct LargeListArray from array of offsets and child value array
+ ///
+ /// This function does the bare minimum of validation of the offsets and
+ /// input types, and will allocate a new offsets array if necessary (i.e. if
+ /// the offsets contain any nulls). If the offsets do not have nulls, they
+ /// are assumed to be well-formed
+ ///
+ /// \param[in] offsets Array containing n + 1 offsets encoding length and
+ /// size. Must be of int64 type
+ /// \param[in] values Array containing list values
+ /// \param[in] pool MemoryPool in case new offsets array needs to be
+ /// allocated because of null values
+ /// \param[out] out Will have length equal to offsets.length() - 1
+ static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool,
+ std::shared_ptr<Array>* out);
+
+ protected:
+ void SetData(const std::shared_ptr<ArrayData>& data);
};
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array/builder_nested.cc b/cpp/src/arrow/array/builder_nested.cc
index 30b3fc0..809cf96 100644
--- a/cpp/src/arrow/array/builder_nested.cc
+++ b/cpp/src/arrow/array/builder_nested.cc
@@ -24,7 +24,6 @@
#include <utility>
#include <vector>
-#include "arrow/array.h"
#include "arrow/buffer.h"
#include "arrow/status.h"
#include "arrow/type.h"
@@ -36,115 +35,6 @@
namespace arrow {
// ----------------------------------------------------------------------
-// ListBuilder
-
-ListBuilder::ListBuilder(MemoryPool* pool,
- std::shared_ptr<ArrayBuilder> const& value_builder,
- const std::shared_ptr<DataType>& type)
- : ArrayBuilder(type ? type
- : std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(value_builder->type())),
- pool),
- offsets_builder_(pool),
- value_builder_(value_builder) {}
-
-Status ListBuilder::AppendValues(const int32_t* offsets, int64_t length,
- const uint8_t* valid_bytes) {
- RETURN_NOT_OK(Reserve(length));
- UnsafeAppendToBitmap(valid_bytes, length);
- offsets_builder_.UnsafeAppend(offsets, length);
- return Status::OK();
-}
-
-Status ListBuilder::CheckNextOffset() const {
- const int64_t num_values = value_builder_->length();
- ARROW_RETURN_IF(
- num_values > kListMaximumElements,
- Status::CapacityError("ListArray cannot contain more than 2^31 - 1 child elements,",
- " have ", num_values));
- return Status::OK();
-}
-
-Status ListBuilder::AppendNextOffset() {
- RETURN_NOT_OK(CheckNextOffset());
- const int64_t num_values = value_builder_->length();
- return offsets_builder_.Append(static_cast<int32_t>(num_values));
-}
-
-Status ListBuilder::Append(bool is_valid) {
- RETURN_NOT_OK(Reserve(1));
- UnsafeAppendToBitmap(is_valid);
- return AppendNextOffset();
-}
-
-Status ListBuilder::AppendNulls(int64_t length) {
- RETURN_NOT_OK(Reserve(length));
- RETURN_NOT_OK(CheckNextOffset());
- UnsafeAppendToBitmap(length, false);
- const int64_t num_values = value_builder_->length();
- for (int64_t i = 0; i < length; ++i) {
- offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_values));
- }
- return Status::OK();
-}
-
-Status ListBuilder::Resize(int64_t capacity) {
- if (capacity > kListMaximumElements) {
- return Status::CapacityError(
- "ListArray cannot reserve space for more then 2^31 - 1 child elements, got ",
- capacity);
- }
- RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-
- // one more then requested for offsets
- RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
- return ArrayBuilder::Resize(capacity);
-}
-
-Status ListBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
- RETURN_NOT_OK(AppendNextOffset());
-
- // Offset padding zeroed by BufferBuilder
- std::shared_ptr<Buffer> offsets;
- RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
-
- std::shared_ptr<ArrayData> items;
- if (values_) {
- items = values_->data();
- } else {
- if (value_builder_->length() == 0) {
- // Try to make sure we get a non-null values buffer (ARROW-2744)
- RETURN_NOT_OK(value_builder_->Resize(0));
- }
- RETURN_NOT_OK(value_builder_->FinishInternal(&items));
- }
-
- // If the type has not been specified in the constructor, infer it
- // This is the case if the value_builder contains a DenseUnionBuilder
- if (!arrow::internal::checked_cast<ListType&>(*type_).value_type()) {
- type_ = std::static_pointer_cast<DataType>(
- std::make_shared<ListType>(value_builder_->type()));
- }
- std::shared_ptr<Buffer> null_bitmap;
- RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
- *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_);
- (*out)->child_data.emplace_back(std::move(items));
- Reset();
- return Status::OK();
-}
-
-void ListBuilder::Reset() {
- ArrayBuilder::Reset();
- values_.reset();
- offsets_builder_.Reset();
- value_builder_->Reset();
-}
-
-ArrayBuilder* ListBuilder::value_builder() const {
- DCHECK(!values_) << "Using value builder is pointless when values_ is set";
- return value_builder_.get();
-}
-// ----------------------------------------------------------------------
// MapBuilder
MapBuilder::MapBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& key_builder,
diff --git a/cpp/src/arrow/array/builder_nested.h b/cpp/src/arrow/array/builder_nested.h
index 8742f2b..9b5b4de 100644
--- a/cpp/src/arrow/array/builder_nested.h
+++ b/cpp/src/arrow/array/builder_nested.h
@@ -17,9 +17,12 @@
#pragma once
+#include <limits>
#include <memory>
+#include <utility>
#include <vector>
+#include "arrow/array.h"
#include "arrow/array/builder_base.h"
#include "arrow/buffer-builder.h"
@@ -28,63 +31,174 @@ namespace arrow {
// ----------------------------------------------------------------------
// List builder
-/// \class ListBuilder
-/// \brief Builder class for variable-length list array value types
-///
-/// To use this class, you must append values to the child array builder and use
-/// the Append function to delimit each distinct list value (once the values
-/// have been appended to the child array) or use the bulk API to append
-/// a sequence of offests and null values.
-///
-/// A note on types. Per arrow/type.h all types in the c++ implementation are
-/// logical so even though this class always builds list array, this can
-/// represent multiple different logical types. If no logical type is provided
-/// at construction time, the class defaults to List<T> where t is taken from the
-/// value_builder/values that the object is constructed with.
-class ARROW_EXPORT ListBuilder : public ArrayBuilder {
+template <typename TYPE>
+class BaseListBuilder : public ArrayBuilder {
public:
+ using TypeClass = TYPE;
+ using offset_type = typename TypeClass::offset_type;
+
/// Use this constructor to incrementally build the value array along with offsets and
/// null bitmap.
- ListBuilder(MemoryPool* pool, const std::shared_ptr<ArrayBuilder>& value_builder,
- const std::shared_ptr<DataType>& type = NULLPTR);
-
- Status Resize(int64_t capacity) override;
- void Reset() override;
- Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
-
- /// \cond FALSE
- using ArrayBuilder::Finish;
- /// \endcond
+ BaseListBuilder(MemoryPool* pool, std::shared_ptr<ArrayBuilder> const& value_builder,
+ const std::shared_ptr<DataType>& type = NULLPTR)
+ : ArrayBuilder(type ? type
+ : std::static_pointer_cast<DataType>(
+ std::make_shared<TypeClass>(value_builder->type())),
+ pool),
+ offsets_builder_(pool),
+ value_builder_(value_builder) {}
+
+ Status Resize(int64_t capacity) override {
+ if (capacity > maximum_elements()) {
+ return Status::CapacityError("List array cannot reserve space for more than ",
+ maximum_elements(), " got ", capacity);
+ }
+ ARROW_RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
+
+ // one more than requested for offsets
+ ARROW_RETURN_NOT_OK(offsets_builder_.Resize(capacity + 1));
+ return ArrayBuilder::Resize(capacity);
+ }
- Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
+ void Reset() override {
+ ArrayBuilder::Reset();
+ values_.reset();
+ offsets_builder_.Reset();
+ value_builder_->Reset();
+ }
/// \brief Vector append
///
/// If passed, valid_bytes is of equal length to values, and any zero byte
/// will be considered as a null for that slot
- Status AppendValues(const int32_t* offsets, int64_t length,
- const uint8_t* valid_bytes = NULLPTR);
+ Status AppendValues(const offset_type* offsets, int64_t length,
+ const uint8_t* valid_bytes = NULLPTR) {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ UnsafeAppendToBitmap(valid_bytes, length);
+ offsets_builder_.UnsafeAppend(offsets, length);
+ return Status::OK();
+ }
/// \brief Start a new variable-length list slot
///
/// This function should be called before beginning to append elements to the
/// value builder
- Status Append(bool is_valid = true);
+ Status Append(bool is_valid = true) {
+ ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppendToBitmap(is_valid);
+ return AppendNextOffset();
+ }
Status AppendNull() final { return Append(false); }
- Status AppendNulls(int64_t length) final;
+ Status AppendNulls(int64_t length) final {
+ ARROW_RETURN_NOT_OK(Reserve(length));
+ ARROW_RETURN_NOT_OK(CheckNextOffset());
+ UnsafeAppendToBitmap(length, false);
+ const int64_t num_values = value_builder_->length();
+ for (int64_t i = 0; i < length; ++i) {
+ offsets_builder_.UnsafeAppend(static_cast<offset_type>(num_values));
+ }
+ return Status::OK();
+ }
+
+ Status FinishInternal(std::shared_ptr<ArrayData>* out) override {
+ ARROW_RETURN_NOT_OK(AppendNextOffset());
+
+ // Offset padding zeroed by BufferBuilder
+ std::shared_ptr<Buffer> offsets;
+ ARROW_RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
+
+ std::shared_ptr<ArrayData> items;
+ if (values_) {
+ items = values_->data();
+ } else {
+ if (value_builder_->length() == 0) {
+ // Try to make sure we get a non-null values buffer (ARROW-2744)
+ ARROW_RETURN_NOT_OK(value_builder_->Resize(0));
+ }
+ ARROW_RETURN_NOT_OK(value_builder_->FinishInternal(&items));
+ }
+
+ // If the type has not been specified in the constructor, infer it
+ // This is the case if the value_builder contains a DenseUnionBuilder
+ if (!arrow::internal::checked_cast<TypeClass&>(*type_).value_type()) {
+ type_ = std::static_pointer_cast<DataType>(
+ std::make_shared<TypeClass>(value_builder_->type()));
+ }
+ std::shared_ptr<Buffer> null_bitmap;
+ ARROW_RETURN_NOT_OK(null_bitmap_builder_.Finish(&null_bitmap));
+ *out = ArrayData::Make(type_, length_, {null_bitmap, offsets}, null_count_);
+ (*out)->child_data.emplace_back(std::move(items));
+ Reset();
+ return Status::OK();
+ }
- ArrayBuilder* value_builder() const;
+ ArrayBuilder* value_builder() const { return value_builder_.get(); }
+
+ // Cannot make this a static attribute because of linking issues
+ static constexpr int64_t maximum_elements() {
+ return std::numeric_limits<offset_type>::max() - 1;
+ }
protected:
- TypedBufferBuilder<int32_t> offsets_builder_;
+ TypedBufferBuilder<offset_type> offsets_builder_;
std::shared_ptr<ArrayBuilder> value_builder_;
std::shared_ptr<Array> values_;
- Status CheckNextOffset() const;
- Status AppendNextOffset();
- Status AppendNextOffset(int64_t num_repeats);
+ Status CheckNextOffset() const {
+ const int64_t num_values = value_builder_->length();
+ ARROW_RETURN_IF(
+ num_values > maximum_elements(),
+ Status::CapacityError("List array cannot contain more than ", maximum_elements(),
+ " child elements,", " have ", num_values));
+ return Status::OK();
+ }
+
+ Status AppendNextOffset() {
+ ARROW_RETURN_NOT_OK(CheckNextOffset());
+ const int64_t num_values = value_builder_->length();
+ return offsets_builder_.Append(static_cast<offset_type>(num_values));
+ }
+};
+
+/// \class ListBuilder
+/// \brief Builder class for variable-length list array value types
+///
+/// To use this class, you must append values to the child array builder and use
+/// the Append function to delimit each distinct list value (once the values
+/// have been appended to the child array) or use the bulk API to append
+/// a sequence of offsets and null values.
+///
+/// A note on types. Per arrow/type.h all types in the C++ implementation are
+/// logical, so even though this class always builds a list array, it can
+/// represent multiple different logical types. If no logical type is provided
+/// at construction time, the class defaults to List<T> where T is taken from the
+/// value_builder/values that the object is constructed with.
+class ARROW_EXPORT ListBuilder : public BaseListBuilder<ListType> {
+ public:
+ using BaseListBuilder::BaseListBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<ListArray>* out) { return FinishTyped(out); }
+};
+
+/// \class LargeListBuilder
+/// \brief Builder class for large variable-length list array value types
+///
+/// Like ListBuilder, but to create large list arrays (with 64-bit offsets).
+class ARROW_EXPORT LargeListBuilder : public BaseListBuilder<LargeListType> {
+ public:
+ using BaseListBuilder::BaseListBuilder;
+
+ /// \cond FALSE
+ using ArrayBuilder::Finish;
+ /// \endcond
+
+ Status Finish(std::shared_ptr<LargeListArray>* out) { return FinishTyped(out); }
};
// ----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array/concatenate-test.cc b/cpp/src/arrow/array/concatenate-test.cc
index 730b25a..4d16b17 100644
--- a/cpp/src/arrow/array/concatenate-test.cc
+++ b/cpp/src/arrow/array/concatenate-test.cc
@@ -165,11 +165,28 @@ TEST_F(ConcatenateTest, ListType) {
auto values_size = size * 4;
auto values = this->GeneratePrimitive<Int8Type>(values_size, null_probability);
auto offsets_vector = this->Offsets<int32_t>(values_size, size);
- // ensure the first offset is 0, which is expected for ListType
- offsets_vector[0] = 0;
+ // Ensure first and last offsets encompass the whole values array
+ offsets_vector.front() = 0;
+ offsets_vector.back() = static_cast<int32_t>(values_size);
std::shared_ptr<Array> offsets;
ArrayFromVector<Int32Type>(offsets_vector, &offsets);
ASSERT_OK(ListArray::FromArrays(*offsets, *values, default_memory_pool(), out));
+ ASSERT_OK(ValidateArray(**out));
+ });
+}
+
+TEST_F(ConcatenateTest, LargeListType) {
+ Check([this](int32_t size, double null_probability, std::shared_ptr<Array>* out) {
+ auto values_size = size * 4;
+ auto values = this->GeneratePrimitive<Int8Type>(values_size, null_probability);
+ auto offsets_vector = this->Offsets<int64_t>(values_size, size);
+ // Ensure first and last offsets encompass the whole values array
+ offsets_vector.front() = 0;
+ offsets_vector.back() = static_cast<int64_t>(values_size);
+ std::shared_ptr<Array> offsets;
+ ArrayFromVector<Int64Type>(offsets_vector, &offsets);
+ ASSERT_OK(LargeListArray::FromArrays(*offsets, *values, default_memory_pool(), out));
+ ASSERT_OK(ValidateArray(**out));
});
}
diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc
index a20b157..4428e4b 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -204,6 +204,14 @@ class ConcatenateImpl {
.Concatenate(out_.child_data[0].get());
}
+ Status Visit(const LargeListType&) {
+ std::vector<Range> value_ranges;
+ RETURN_NOT_OK(ConcatenateOffsets<int64_t>(Buffers(1, sizeof(int64_t)), pool_,
+ &out_.buffers[1], &value_ranges));
+ return ConcatenateImpl(ChildData(0, value_ranges), pool_)
+ .Concatenate(out_.child_data[0].get());
+ }
+
Status Visit(const FixedSizeListType&) {
return ConcatenateImpl(ChildData(0), pool_).Concatenate(out_.child_data[0].get());
}
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 44b0d04..b13ce20 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -136,6 +136,14 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
out->reset(new ListBuilder(pool, std::move(value_builder), type));
return Status::OK();
}
+ case Type::LARGE_LIST: {
+ std::unique_ptr<ArrayBuilder> value_builder;
+ std::shared_ptr<DataType> value_type =
+ internal::checked_cast<const LargeListType&>(*type).value_type();
+ RETURN_NOT_OK(MakeBuilder(pool, value_type, &value_builder));
+ out->reset(new LargeListBuilder(pool, std::move(value_builder), type));
+ return Status::OK();
+ }
case Type::MAP: {
const auto& map_type = internal::checked_cast<const MapType&>(*type);
std::unique_ptr<ArrayBuilder> key_builder, item_builder;
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index cb606e3..ff4c2b5 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -36,6 +36,7 @@
#include "arrow/status.h"
#include "arrow/tensor.h"
#include "arrow/type.h"
+#include "arrow/type_traits.h"
#include "arrow/util/bit-util.h"
#include "arrow/util/checked_cast.h"
#include "arrow/util/logging.h"
@@ -174,8 +175,9 @@ class RangeEqualsVisitor {
return true;
}
- bool CompareLists(const ListArray& left) {
- const auto& right = checked_cast<const ListArray&>(right_);
+ template <typename ListArrayType>
+ bool CompareLists(const ListArrayType& left) {
+ const auto& right = checked_cast<const ListArrayType&>(right_);
const std::shared_ptr<Array>& left_values = left.values();
const std::shared_ptr<Array>& right_values = right.values();
@@ -187,10 +189,10 @@ class RangeEqualsVisitor {
return false;
}
if (is_null) continue;
- const int32_t begin_offset = left.value_offset(i);
- const int32_t end_offset = left.value_offset(i + 1);
- const int32_t right_begin_offset = right.value_offset(o_i);
- const int32_t right_end_offset = right.value_offset(o_i + 1);
+ const auto begin_offset = left.value_offset(i);
+ const auto end_offset = left.value_offset(i + 1);
+ const auto right_begin_offset = right.value_offset(o_i);
+ const auto right_end_offset = right.value_offset(o_i + 1);
// Underlying can't be equal if the size isn't equal
if (end_offset - begin_offset != right_end_offset - right_begin_offset) {
return false;
@@ -339,6 +341,11 @@ class RangeEqualsVisitor {
return Status::OK();
}
+ Status Visit(const LargeListArray& left) {
+ result_ = CompareLists(left);
+ return Status::OK();
+ }
+
Status Visit(const FixedSizeListArray& left) {
const auto& right = checked_cast<const FixedSizeListArray&>(right_);
result_ = left.values()->RangeEquals(
@@ -569,6 +576,20 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
}
}
+ template <typename ListArrayType>
+ bool CompareList(const ListArrayType& left) {
+ const auto& right = checked_cast<const ListArrayType&>(right_);
+
+ bool equal_offsets = ValueOffsetsEqual<ListArrayType>(left);
+ if (!equal_offsets) {
+ return false;
+ }
+
+ return left.values()->RangeEquals(left.value_offset(0),
+ left.value_offset(left.length()),
+ right.value_offset(0), right.values());
+ }
+
Status Visit(const BinaryArray& left) {
result_ = CompareBinary(left);
return Status::OK();
@@ -580,16 +601,12 @@ class ArrayEqualsVisitor : public RangeEqualsVisitor {
}
Status Visit(const ListArray& left) {
- const auto& right = checked_cast<const ListArray&>(right_);
- bool equal_offsets = ValueOffsetsEqual<ListArray>(left);
- if (!equal_offsets) {
- result_ = false;
- return Status::OK();
- }
+ result_ = CompareList(left);
+ return Status::OK();
+ }
- result_ =
- left.values()->RangeEquals(left.value_offset(0), left.value_offset(left.length()),
- right.value_offset(0), right.values());
+ Status Visit(const LargeListArray& left) {
+ result_ = CompareList(left);
return Status::OK();
}
@@ -760,6 +777,8 @@ class TypeEqualsVisitor {
Status Visit(const ListType& left) { return VisitChildren(left); }
+ Status Visit(const LargeListType& left) { return VisitChildren(left); }
+
Status Visit(const MapType& left) {
const auto& right = checked_cast<const MapType&>(right_);
if (left.keys_sorted() != right.keys_sorted()) {
@@ -858,6 +877,12 @@ class ScalarEqualsVisitor {
return Status::OK();
}
+ Status Visit(const LargeListScalar& left) {
+ const auto& right = checked_cast<const LargeListScalar&>(right_);
+ result_ = internal::SharedPtrEquals(left.value, right.value);
+ return Status::OK();
+ }
+
Status Visit(const MapScalar& left) {
const auto& right = checked_cast<const MapScalar&>(right_);
result_ = internal::SharedPtrEquals(left.keys, right.keys) &&
diff --git a/cpp/src/arrow/compute/kernels/cast-test.cc b/cpp/src/arrow/compute/kernels/cast-test.cc
index 80538f2..61f1500 100644
--- a/cpp/src/arrow/compute/kernels/cast-test.cc
+++ b/cpp/src/arrow/compute/kernels/cast-test.cc
@@ -1147,6 +1147,34 @@ TEST_F(TestCast, ListToList) {
CheckPass(*float64_list_array, *int64_list_array, int64_list_array->type(), options);
}
+TEST_F(TestCast, LargeListToLargeList) {
+ // Like ListToList above, only testing the basics
+ CastOptions options;
+ std::shared_ptr<Array> offsets;
+
+ std::vector<int64_t> offsets_values = {0, 1, 2, 5, 7, 7, 8, 10};
+ std::vector<bool> offsets_is_valid = {true, true, true, true, false, true, true, true};
+ ArrayFromVector<Int64Type, int64_t>(offsets_is_valid, offsets_values, &offsets);
+
+ std::shared_ptr<Array> int32_plain_array =
+ TestBase::MakeRandomArray<typename TypeTraits<Int32Type>::ArrayType>(10, 2);
+ std::shared_ptr<Array> int32_list_array;
+ ASSERT_OK(
+ LargeListArray::FromArrays(*offsets, *int32_plain_array, pool_, &int32_list_array));
+
+ std::shared_ptr<Array> float64_plain_array;
+ ASSERT_OK(
+ Cast(&this->ctx_, *int32_plain_array, float64(), options, &float64_plain_array));
+ std::shared_ptr<Array> float64_list_array;
+ ASSERT_OK(LargeListArray::FromArrays(*offsets, *float64_plain_array, pool_,
+ &float64_list_array));
+
+ CheckPass(*int32_list_array, *float64_list_array, float64_list_array->type(), options);
+
+ options.allow_float_truncate = true;
+ CheckPass(*float64_list_array, *int32_list_array, int32_list_array->type(), options);
+}
+
TEST_F(TestCast, IdentityCasts) {
// ARROW-4102
auto CheckIdentityCast = [this](std::shared_ptr<DataType> type,
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index a8b6615..839f9a2 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -645,6 +645,7 @@ Status InvokeWithAllocation(FunctionContext* ctx, UnaryKernel* func, const Datum
return Status::OK();
}
+template <typename TypeClass>
class ListCastKernel : public CastKernelBase {
public:
ListCastKernel(std::unique_ptr<UnaryKernel> child_caster,
@@ -655,7 +656,7 @@ class ListCastKernel : public CastKernelBase {
DCHECK_EQ(Datum::ARRAY, input.kind());
const ArrayData& in_data = *input.array();
- DCHECK_EQ(Type::LIST, in_data.type->id());
+ DCHECK_EQ(TypeClass::type_id, in_data.type->id());
ArrayData* result;
if (in_data.offset != 0) {
@@ -1160,19 +1161,20 @@ GET_CAST_FUNCTION(DICTIONARY_CASES, DictionaryType)
namespace {
+template <typename TypeClass>
Status GetListCastFunc(const DataType& in_type, std::shared_ptr<DataType> out_type,
const CastOptions& options, std::unique_ptr<UnaryKernel>* kernel) {
- if (out_type->id() != Type::LIST) {
+ if (out_type->id() != TypeClass::type_id) {
// Kernel will be null
return Status::OK();
}
- const DataType& in_value_type = *checked_cast<const ListType&>(in_type).value_type();
+ const DataType& in_value_type = *checked_cast<const TypeClass&>(in_type).value_type();
std::shared_ptr<DataType> out_value_type =
- checked_cast<const ListType&>(*out_type).value_type();
+ checked_cast<const TypeClass&>(*out_type).value_type();
std::unique_ptr<UnaryKernel> child_caster;
RETURN_NOT_OK(GetCastFunction(in_value_type, out_value_type, options, &child_caster));
*kernel = std::unique_ptr<UnaryKernel>(
- new ListCastKernel(std::move(child_caster), std::move(out_type)));
+ new ListCastKernel<TypeClass>(std::move(child_caster), std::move(out_type)));
return Status::OK();
}
@@ -1238,7 +1240,12 @@ Status GetCastFunction(const DataType& in_type, std::shared_ptr<DataType> out_ty
CAST_FUNCTION_CASE(LargeStringType);
CAST_FUNCTION_CASE(DictionaryType);
case Type::LIST:
- RETURN_NOT_OK(GetListCastFunc(in_type, std::move(out_type), options, kernel));
+ RETURN_NOT_OK(
+ GetListCastFunc<ListType>(in_type, std::move(out_type), options, kernel));
+ break;
+ case Type::LARGE_LIST:
+ RETURN_NOT_OK(
+ GetListCastFunc<LargeListType>(in_type, std::move(out_type), options, kernel));
break;
default:
break;
diff --git a/cpp/src/arrow/compute/kernels/filter-test.cc b/cpp/src/arrow/compute/kernels/filter-test.cc
index 253093e..45fd9e5 100644
--- a/cpp/src/arrow/compute/kernels/filter-test.cc
+++ b/cpp/src/arrow/compute/kernels/filter-test.cc
@@ -358,6 +358,15 @@ TEST_F(TestFilterKernelWithList, FilterListListInt32) {
])");
}
+class TestFilterKernelWithLargeList : public TestFilterKernel<LargeListType> {};
+
+TEST_F(TestFilterKernelWithLargeList, FilterListInt32) {
+ std::string list_json = "[[], [1,2], null, [3]]";
+ this->AssertFilter(large_list(int32()), list_json, "[0, 0, 0, 0]", "[]");
+ this->AssertFilter(large_list(int32()), list_json, "[0, 1, 1, null]",
+ "[[1,2], null, null]");
+}
+
class TestFilterKernelWithFixedSizeList : public TestFilterKernel<FixedSizeListType> {};
TEST_F(TestFilterKernelWithFixedSizeList, FilterFixedSizeListInt32) {
diff --git a/cpp/src/arrow/compute/kernels/take-internal.h b/cpp/src/arrow/compute/kernels/take-internal.h
index 96519a9..04e89d1 100644
--- a/cpp/src/arrow/compute/kernels/take-internal.h
+++ b/cpp/src/arrow/compute/kernels/take-internal.h
@@ -298,20 +298,23 @@ class TakerImpl<IndexSequence, NullType> : public Taker<IndexSequence> {
int64_t length_ = 0;
};
-template <typename IndexSequence>
-class TakerImpl<IndexSequence, ListType> : public Taker<IndexSequence> {
+template <typename IndexSequence, typename TypeClass>
+class ListTakerImpl : public Taker<IndexSequence> {
public:
+ using offset_type = typename TypeClass::offset_type;
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
+
using Taker<IndexSequence>::Taker;
Status Init() override {
- const auto& list_type = checked_cast<const ListType&>(*this->type_);
+ const auto& list_type = checked_cast<const TypeClass&>(*this->type_);
return Taker<RangeIndexSequence>::Make(list_type.value_type(), &value_taker_);
}
Status SetContext(FunctionContext* ctx) override {
auto pool = ctx->memory_pool();
null_bitmap_builder_.reset(new TypedBufferBuilder<bool>(pool));
- offset_builder_.reset(new TypedBufferBuilder<int32_t>(pool));
+ offset_builder_.reset(new TypedBufferBuilder<offset_type>(pool));
RETURN_NOT_OK(offset_builder_->Append(0));
return value_taker_->SetContext(ctx);
}
@@ -319,12 +322,12 @@ class TakerImpl<IndexSequence, ListType> : public Taker<IndexSequence> {
Status Take(const Array& values, IndexSequence indices) override {
DCHECK(this->type_->Equals(values.type()));
- const auto& list_array = checked_cast<const ListArray&>(values);
+ const auto& list_array = checked_cast<const ArrayType&>(values);
RETURN_NOT_OK(null_bitmap_builder_->Reserve(indices.length()));
RETURN_NOT_OK(offset_builder_->Reserve(indices.length()));
- int32_t offset = offset_builder_->data()[offset_builder_->length() - 1];
+ offset_type offset = offset_builder_->data()[offset_builder_->length() - 1];
return VisitIndices(indices, values, [&](int64_t index, bool is_valid) {
null_bitmap_builder_->UnsafeAppend(is_valid);
@@ -340,13 +343,7 @@ class TakerImpl<IndexSequence, ListType> : public Taker<IndexSequence> {
});
}
- Status Finish(std::shared_ptr<Array>* out) override { return FinishAs<ListArray>(out); }
-
- protected:
- // this added method is provided for use by TakerImpl<IndexSequence, MapType>,
- // which needs to construct a MapArray rather than a ListArray
- template <typename T>
- Status FinishAs(std::shared_ptr<Array>* out) {
+ Status Finish(std::shared_ptr<Array>* out) override {
auto null_count = null_bitmap_builder_->false_count();
auto length = null_bitmap_builder_->length();
@@ -357,24 +354,30 @@ class TakerImpl<IndexSequence, ListType> : public Taker<IndexSequence> {
std::shared_ptr<Array> taken_values;
RETURN_NOT_OK(value_taker_->Finish(&taken_values));
- out->reset(
- new T(this->type_, length, offsets, taken_values, null_bitmap, null_count));
+ out->reset(new ArrayType(this->type_, length, offsets, taken_values, null_bitmap,
+ null_count));
return Status::OK();
}
std::unique_ptr<TypedBufferBuilder<bool>> null_bitmap_builder_;
- std::unique_ptr<TypedBufferBuilder<int32_t>> offset_builder_;
+ std::unique_ptr<TypedBufferBuilder<offset_type>> offset_builder_;
std::unique_ptr<Taker<RangeIndexSequence>> value_taker_;
};
template <typename IndexSequence>
-class TakerImpl<IndexSequence, MapType> : public TakerImpl<IndexSequence, ListType> {
- public:
- using TakerImpl<IndexSequence, ListType>::TakerImpl;
+class TakerImpl<IndexSequence, ListType> : public ListTakerImpl<IndexSequence, ListType> {
+ using ListTakerImpl<IndexSequence, ListType>::ListTakerImpl;
+};
- Status Finish(std::shared_ptr<Array>* out) override {
- return this->template FinishAs<MapArray>(out);
- }
+template <typename IndexSequence>
+class TakerImpl<IndexSequence, LargeListType>
+ : public ListTakerImpl<IndexSequence, LargeListType> {
+ using ListTakerImpl<IndexSequence, LargeListType>::ListTakerImpl;
+};
+
+template <typename IndexSequence>
+class TakerImpl<IndexSequence, MapType> : public ListTakerImpl<IndexSequence, MapType> {
+ using ListTakerImpl<IndexSequence, MapType>::ListTakerImpl;
};
template <typename IndexSequence>
diff --git a/cpp/src/arrow/compute/kernels/take-test.cc b/cpp/src/arrow/compute/kernels/take-test.cc
index 7ae9321..6a8e30b 100644
--- a/cpp/src/arrow/compute/kernels/take-test.cc
+++ b/cpp/src/arrow/compute/kernels/take-test.cc
@@ -261,6 +261,15 @@ TEST_F(TestTakeKernelWithList, TakeListListInt32) {
"[[], [], [], [], [], [], [[1], [2, null, 2], []]]");
}
+class TestTakeKernelWithLargeList : public TestTakeKernel<LargeListType> {};
+
+TEST_F(TestTakeKernelWithLargeList, TakeLargeListInt32) {
+ std::string list_json = "[[], [1,2], null, [3]]";
+ this->AssertTake(large_list(int32()), list_json, "[]", "[]");
+ this->AssertTake(large_list(int32()), list_json, "[null, 1, 2, 0]",
+ "[null, [1,2], null, []]");
+}
+
class TestTakeKernelWithFixedSizeList : public TestTakeKernel<FixedSizeListType> {};
TEST_F(TestTakeKernelWithFixedSizeList, TakeFixedSizeListInt32) {
diff --git a/cpp/src/arrow/extension_type.cc b/cpp/src/arrow/extension_type.cc
index b08f974..b51804c 100644
--- a/cpp/src/arrow/extension_type.cc
+++ b/cpp/src/arrow/extension_type.cc
@@ -43,8 +43,6 @@ std::string ExtensionType::ToString() const {
return ss.str();
}
-std::string ExtensionType::name() const { return "extension"; }
-
ExtensionArray::ExtensionArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
ExtensionArray::ExtensionArray(const std::shared_ptr<DataType>& type,
diff --git a/cpp/src/arrow/extension_type.h b/cpp/src/arrow/extension_type.h
index 8bf4639..37b749f 100644
--- a/cpp/src/arrow/extension_type.h
+++ b/cpp/src/arrow/extension_type.h
@@ -34,13 +34,16 @@ class ARROW_EXPORT ExtensionType : public DataType {
public:
static constexpr Type::type type_id = Type::EXTENSION;
+ static constexpr const char* type_name() { return "extension"; }
+
/// \brief The type of array used to represent this extension type's data
std::shared_ptr<DataType> storage_type() const { return storage_type_; }
DataTypeLayout layout() const override;
std::string ToString() const override;
- std::string name() const override;
+
+ std::string name() const override { return "extension"; }
/// \brief Unique name of extension type used to identify type for
/// serialization
diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc
index 49a884e..ddd68e2 100644
--- a/cpp/src/arrow/ipc/json-internal.cc
+++ b/cpp/src/arrow/ipc/json-internal.cc
@@ -165,7 +165,7 @@ class SchemaWriter {
template <typename T>
typename std::enable_if<std::is_base_of<NoExtraMeta, T>::value ||
- std::is_base_of<ListType, T>::value ||
+ std::is_base_of<BaseListType, T>::value ||
std::is_base_of<StructType, T>::value,
void>::type
WriteTypeMetadata(const T& type) {}
@@ -334,6 +334,11 @@ class SchemaWriter {
return Status::OK();
}
+ Status Visit(const LargeListType& type) {
+ WriteName("large_list", type);
+ return Status::OK();
+ }
+
Status Visit(const MapType& type) {
WriteName("map", type);
return Status::OK();
@@ -583,11 +588,14 @@ class ArrayWriter {
return VisitArrayValues(*array.indices());
}
- Status Visit(const ListArray& array) {
+ template <typename T>
+ typename std::enable_if<std::is_base_of<ListArray, T>::value ||
+ std::is_base_of<LargeListArray, T>::value,
+ Status>::type
+ Visit(const T& array) {
WriteValidityField(array);
WriteIntegerField("OFFSET", array.raw_value_offsets(), array.length() + 1);
- const auto& type = checked_cast<const ListType&>(*array.type());
- return WriteChildren(type.children(), {array.values()});
+ return WriteChildren(array.type()->children(), {array.values()});
}
Status Visit(const FixedSizeListArray& array) {
@@ -948,6 +956,11 @@ static Status GetType(const RjObject& json_type,
return Status::Invalid("List must have exactly one child");
}
*type = list(children[0]);
+ } else if (type_name == "large_list") {
+ if (children.size() != 1) {
+ return Status::Invalid("Large list must have exactly one child");
+ }
+ *type = large_list(children[0]);
} else if (type_name == "map") {
return GetMap(json_type, children, type);
} else if (type_name == "fixedsizelist") {
@@ -1263,15 +1276,23 @@ class ArrayReader {
T* values = reinterpret_cast<T*>(buffer->mutable_data());
for (int i = 0; i < length; ++i) {
const rj::Value& val = json_array[i];
- DCHECK(val.IsInt());
- values[i] = static_cast<T>(val.GetInt());
+ DCHECK(val.IsInt() || val.IsInt64());
+ if (val.IsInt()) {
+ values[i] = static_cast<T>(val.GetInt());
+ } else {
+ values[i] = static_cast<T>(val.GetInt64());
+ }
}
*out = buffer;
return Status::OK();
}
+ template <typename T>
Status CreateList(const std::shared_ptr<DataType>& type, std::shared_ptr<Array>* out) {
+ using offset_type = typename T::offset_type;
+ using ArrayType = typename TypeTraits<T>::ArrayType;
+
int32_t null_count = 0;
std::shared_ptr<Buffer> validity_buffer;
RETURN_NOT_OK(GetValidityBuffer(is_valid_, &null_count, &validity_buffer));
@@ -1279,19 +1300,23 @@ class ArrayReader {
const auto& json_offsets = obj_.FindMember("OFFSET");
RETURN_NOT_ARRAY("OFFSET", json_offsets, obj_);
std::shared_ptr<Buffer> offsets_buffer;
- RETURN_NOT_OK(GetIntArray<int32_t>(json_offsets->value.GetArray(), length_ + 1,
- &offsets_buffer));
+ RETURN_NOT_OK(GetIntArray<offset_type>(json_offsets->value.GetArray(), length_ + 1,
+ &offsets_buffer));
std::vector<std::shared_ptr<Array>> children;
RETURN_NOT_OK(GetChildren(obj_, *type, &children));
DCHECK_EQ(children.size(), 1);
- out->reset(new ListArray(type, length_, offsets_buffer, children[0], validity_buffer,
+ out->reset(new ArrayType(type, length_, offsets_buffer, children[0], validity_buffer,
null_count));
return Status::OK();
}
- Status Visit(const ListType& type) { return CreateList(type_, &result_); }
+ Status Visit(const ListType& type) { return CreateList<ListType>(type_, &result_); }
+
+ Status Visit(const LargeListType& type) {
+ return CreateList<LargeListType>(type_, &result_);
+ }
Status Visit(const MapType& type) {
auto list_type = std::make_shared<ListType>(field(
@@ -1299,7 +1324,7 @@ class ArrayReader {
struct_({field("key", type.key_type(), false), field("value", type.item_type())}),
false));
std::shared_ptr<Array> list_array;
- RETURN_NOT_OK(CreateList(list_type, &list_array));
+ RETURN_NOT_OK(CreateList<ListType>(list_type, &list_array));
auto map_data = list_array->data();
map_data->type = type_;
result_ = std::make_shared<MapArray>(map_data);
diff --git a/cpp/src/arrow/ipc/json-simple-test.cc b/cpp/src/arrow/ipc/json-simple-test.cc
index 77ab770..c202402 100644
--- a/cpp/src/arrow/ipc/json-simple-test.cc
+++ b/cpp/src/arrow/ipc/json-simple-test.cc
@@ -551,6 +551,21 @@ TEST(TestList, IntegerListList) {
}
}
+TEST(TestLargeList, Basics) {
+ // Similar as TestList above, only testing the basics
+ auto pool = default_memory_pool();
+ std::shared_ptr<DataType> type = large_list(int16());
+ std::shared_ptr<Array> offsets, values, expected, actual;
+
+ ASSERT_OK(ArrayFromJSON(type, "[[], [null], [6, null]]", &actual));
+ ASSERT_OK(ValidateArray(*actual));
+ ArrayFromVector<Int64Type>({0, 0, 1, 3}, &offsets);
+ auto is_valid = std::vector<bool>{false, true, false};
+ ArrayFromVector<Int16Type>(is_valid, {0, 6, 0}, &values);
+ ASSERT_OK(LargeListArray::FromArrays(*offsets, *values, pool, &expected));
+ AssertArraysEqual(*expected, *actual);
+}
+
TEST(TestMap, IntegerToInteger) {
auto type = map(int16(), int16());
std::shared_ptr<Array> expected, actual;
diff --git a/cpp/src/arrow/ipc/json-simple.cc b/cpp/src/arrow/ipc/json-simple.cc
index 20ac025..d13d346 100644
--- a/cpp/src/arrow/ipc/json-simple.cc
+++ b/cpp/src/arrow/ipc/json-simple.cc
@@ -415,15 +415,19 @@ class FixedSizeBinaryConverter final
// ------------------------------------------------------------------------
// Converter for list arrays
-class ListConverter final : public ConcreteConverter<ListConverter> {
+template <typename TYPE>
+class ListConverter final : public ConcreteConverter<ListConverter<TYPE>> {
public:
- explicit ListConverter(const std::shared_ptr<DataType>& type) { type_ = type; }
+ using BuilderType = typename TypeTraits<TYPE>::BuilderType;
+
+ explicit ListConverter(const std::shared_ptr<DataType>& type) { this->type_ = type; }
Status Init() override {
- const auto& list_type = checked_cast<const ListType&>(*type_);
+ const auto& list_type = checked_cast<const TYPE&>(*this->type_);
RETURN_NOT_OK(GetConverter(list_type.value_type(), &child_converter_));
auto child_builder = child_converter_->builder();
- builder_ = std::make_shared<ListBuilder>(default_memory_pool(), child_builder, type_);
+ builder_ =
+ std::make_shared<BuilderType>(default_memory_pool(), child_builder, this->type_);
return Status::OK();
}
@@ -441,7 +445,7 @@ class ListConverter final : public ConcreteConverter<ListConverter> {
std::shared_ptr<ArrayBuilder> builder() override { return builder_; }
private:
- std::shared_ptr<ListBuilder> builder_;
+ std::shared_ptr<BuilderType> builder_;
std::shared_ptr<Converter> child_converter_;
};
@@ -734,7 +738,8 @@ Status GetConverter(const std::shared_ptr<DataType>& type,
SIMPLE_CONVERTER_CASE(Type::BOOL, BooleanConverter)
SIMPLE_CONVERTER_CASE(Type::FLOAT, FloatConverter<FloatType>)
SIMPLE_CONVERTER_CASE(Type::DOUBLE, FloatConverter<DoubleType>)
- SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter)
+ SIMPLE_CONVERTER_CASE(Type::LIST, ListConverter<ListType>)
+ SIMPLE_CONVERTER_CASE(Type::LARGE_LIST, ListConverter<LargeListType>)
SIMPLE_CONVERTER_CASE(Type::MAP, MapConverter)
SIMPLE_CONVERTER_CASE(Type::FIXED_SIZE_LIST, FixedSizeListConverter)
SIMPLE_CONVERTER_CASE(Type::STRUCT, StructConverter)
diff --git a/cpp/src/arrow/ipc/json-test.cc b/cpp/src/arrow/ipc/json-test.cc
index 338552d..fd8c962 100644
--- a/cpp/src/arrow/ipc/json-test.cc
+++ b/cpp/src/arrow/ipc/json-test.cc
@@ -145,7 +145,9 @@ TEST(TestJsonSchemaWriter, FlatTypes) {
field("f16", timestamp(TimeUnit::NANO)),
field("f17", time64(TimeUnit::MICRO)),
field("f18", union_({field("u1", int8()), field("u2", time32(TimeUnit::MILLI))},
- {0, 1}, UnionMode::DENSE))};
+ {0, 1}, UnionMode::DENSE)),
+ field("f19", large_list(uint8())),
+ };
Schema schema(fields);
TestSchemaRoundTrip(schema);
@@ -194,15 +196,24 @@ TEST(TestJsonArrayWriter, NestedTypes) {
// List
std::vector<bool> list_is_valid = {true, false, true, true, true};
- std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7};
-
std::shared_ptr<Buffer> list_bitmap;
ASSERT_OK(GetBitmapFromVector(list_is_valid, &list_bitmap));
+ std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7};
std::shared_ptr<Buffer> offsets_buffer = Buffer::Wrap(offsets);
+ {
+ ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap,
+ 1);
+ TestArrayRoundTrip(list_array);
+ }
- ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap, 1);
-
- TestArrayRoundTrip(list_array);
+ // LargeList
+ std::vector<int64_t> large_offsets = {0, 0, 0, 1, 4, 7};
+ std::shared_ptr<Buffer> large_offsets_buffer = Buffer::Wrap(large_offsets);
+ {
+ LargeListArray list_array(large_list(value_type), 5, large_offsets_buffer,
+ values_array, list_bitmap, 1);
+ TestArrayRoundTrip(list_array);
+ }
// Map
auto map_type = map(utf8(), int32());
diff --git a/cpp/src/arrow/ipc/metadata-internal.cc b/cpp/src/arrow/ipc/metadata-internal.cc
index 93f859a..c9e1bd7 100644
--- a/cpp/src/arrow/ipc/metadata-internal.cc
+++ b/cpp/src/arrow/ipc/metadata-internal.cc
@@ -322,6 +322,12 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
}
*out = std::make_shared<ListType>(children[0]);
return Status::OK();
+ case flatbuf::Type_LargeList:
+ if (children.size() != 1) {
+ return Status::Invalid("LargeList must have exactly 1 child field");
+ }
+ *out = std::make_shared<LargeListType>(children[0]);
+ return Status::OK();
case flatbuf::Type_Map:
if (children.size() != 1) {
return Status::Invalid("Map must have exactly 1 child field");
@@ -640,6 +646,13 @@ class FieldToFlatbufferVisitor {
return Status::OK();
}
+ Status Visit(const LargeListType& type) {
+ fb_type_ = flatbuf::Type_LargeList;
+ RETURN_NOT_OK(AppendChildFields(fbb_, type, &children_, dictionary_memo_));
+ type_offset_ = flatbuf::CreateLargeList(fbb_).Union();
+ return Status::OK();
+ }
+
Status Visit(const MapType& type) {
fb_type_ = flatbuf::Type_Map;
RETURN_NOT_OK(AppendChildFields(fbb_, type, &children_, dictionary_memo_));
diff --git a/cpp/src/arrow/ipc/reader.cc b/cpp/src/arrow/ipc/reader.cc
index 8ddec2e..4a554aa 100644
--- a/cpp/src/arrow/ipc/reader.cc
+++ b/cpp/src/arrow/ipc/reader.cc
@@ -42,6 +42,7 @@
#include "arrow/status.h"
#include "arrow/tensor.h"
#include "arrow/type.h"
+#include "arrow/type_traits.h"
#include "arrow/util/logging.h"
#include "arrow/visitor_inline.h"
@@ -213,6 +214,21 @@ class ArrayLoader {
return GetBuffer(context_->buffer_index++, &out_->buffers[2]);
}
+ template <typename TYPE>
+ Status LoadList(const TYPE& type) {
+ out_->buffers.resize(2);
+
+ RETURN_NOT_OK(LoadCommon());
+ RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1]));
+
+ const int num_children = type.num_children();
+ if (num_children != 1) {
+ return Status::Invalid("Wrong number of children: ", num_children);
+ }
+
+ return LoadChildren(type.children());
+ }
+
Status LoadChild(const Field& field, ArrayData* out) {
ArrayLoader loader(field, out, context_);
--context_->max_recursion_depth;
@@ -262,18 +278,9 @@ class ArrayLoader {
return GetBuffer(context_->buffer_index++, &out_->buffers[1]);
}
- Status Visit(const ListType& type) {
- out_->buffers.resize(2);
-
- RETURN_NOT_OK(LoadCommon());
- RETURN_NOT_OK(GetBuffer(context_->buffer_index++, &out_->buffers[1]));
-
- const int num_children = type.num_children();
- if (num_children != 1) {
- return Status::Invalid("Wrong number of children: ", num_children);
- }
-
- return LoadChildren(type.children());
+ template <typename T>
+ enable_if_base_list<T, Status> Visit(const T& type) {
+ return LoadList(type);
}
Status Visit(const FixedSizeListType& type) {
diff --git a/cpp/src/arrow/ipc/test-common.cc b/cpp/src/arrow/ipc/test-common.cc
index 4cf13ec..1cb40d9 100644
--- a/cpp/src/arrow/ipc/test-common.cc
+++ b/cpp/src/arrow/ipc/test-common.cc
@@ -72,9 +72,13 @@ Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool
return Status::OK();
}
-Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists,
- bool include_nulls, MemoryPool* pool,
- std::shared_ptr<Array>* out) {
+template <typename TypeClass>
+static Status MakeListArray(const std::shared_ptr<Array>& child_array, int num_lists,
+ bool include_nulls, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ using offset_type = typename TypeClass::offset_type;
+ using ArrayType = typename TypeTraits<TypeClass>::ArrayType;
+
// Create the null list values
std::vector<uint8_t> valid_lists(num_lists);
const double null_percent = include_nulls ? 0.1 : 0;
@@ -83,39 +87,52 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
// Create list offsets
const int max_list_size = 10;
- std::vector<int32_t> list_sizes(num_lists, 0);
- std::vector<int32_t> offsets(
+ std::vector<offset_type> list_sizes(num_lists, 0);
+ std::vector<offset_type> offsets(
num_lists + 1, 0); // +1 so we can shift for nulls. See partial sum below.
- const uint32_t seed = static_cast<uint32_t>(child_array->length());
+ const auto seed = static_cast<uint32_t>(child_array->length());
if (num_lists > 0) {
rand_uniform_int(num_lists, seed, 0, max_list_size, list_sizes.data());
// make sure sizes are consistent with null
std::transform(list_sizes.begin(), list_sizes.end(), valid_lists.begin(),
list_sizes.begin(),
- [](int32_t size, int32_t valid) { return valid == 0 ? 0 : size; });
+ [](offset_type size, uint8_t valid) { return valid == 0 ? 0 : size; });
std::partial_sum(list_sizes.begin(), list_sizes.end(), ++offsets.begin());
// Force invariants
- const int32_t child_length = static_cast<int32_t>(child_array->length());
+ const auto child_length = static_cast<offset_type>(child_array->length());
offsets[0] = 0;
std::replace_if(offsets.begin(), offsets.end(),
- [child_length](int32_t offset) { return offset > child_length; },
+ [child_length](offset_type offset) { return offset > child_length; },
child_length);
}
- offsets[num_lists] = static_cast<int32_t>(child_array->length());
+ offsets[num_lists] = static_cast<offset_type>(child_array->length());
/// TODO(wesm): Implement support for nulls in ListArray::FromArrays
std::shared_ptr<Buffer> null_bitmap, offsets_buffer;
RETURN_NOT_OK(GetBitmapFromVector(valid_lists, &null_bitmap));
RETURN_NOT_OK(CopyBufferFromVector(offsets, pool, &offsets_buffer));
- *out = std::make_shared<ListArray>(list(child_array->type()), num_lists, offsets_buffer,
- child_array, null_bitmap, kUnknownNullCount);
+ *out = std::make_shared<ArrayType>(std::make_shared<TypeClass>(child_array->type()),
+ num_lists, offsets_buffer, child_array, null_bitmap,
+ kUnknownNullCount);
return ValidateArray(**out);
}
+Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists,
+ bool include_nulls, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ return MakeListArray<ListType>(child_array, num_lists, include_nulls, pool, out);
+}
+
+Status MakeRandomLargeListArray(const std::shared_ptr<Array>& child_array, int num_lists,
+ bool include_nulls, MemoryPool* pool,
+ std::shared_ptr<Array>* out) {
+ return MakeListArray<LargeListType>(child_array, num_lists, include_nulls, pool, out);
+}
+
Status MakeRandomMapArray(const std::shared_ptr<Array>& key_array,
const std::shared_ptr<Array>& item_array, int num_maps,
bool include_nulls, MemoryPool* pool,
@@ -274,22 +291,24 @@ Status MakeListRecordBatch(std::shared_ptr<RecordBatch>* out) {
// Make the schema
auto f0 = field("f0", list(int32()));
auto f1 = field("f1", list(list(int32())));
- auto f2 = field("f2", int32());
+ auto f2 = field("f2", large_list(int32()));
auto schema = ::arrow::schema({f0, f1, f2});
// Example data
MemoryPool* pool = default_memory_pool();
const int length = 200;
- std::shared_ptr<Array> leaf_values, list_array, list_list_array, flat_array;
+ std::shared_ptr<Array> leaf_values, list_array, list_list_array, large_list_array;
const bool include_nulls = true;
RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &leaf_values));
RETURN_NOT_OK(
MakeRandomListArray(leaf_values, length, include_nulls, pool, &list_array));
RETURN_NOT_OK(
MakeRandomListArray(list_array, length, include_nulls, pool, &list_list_array));
- RETURN_NOT_OK(MakeRandomInt32Array(length, include_nulls, pool, &flat_array));
- *out = RecordBatch::Make(schema, length, {list_array, list_list_array, flat_array});
+ RETURN_NOT_OK(MakeRandomLargeListArray(leaf_values, length, include_nulls, pool,
+ &large_list_array));
+ *out =
+ RecordBatch::Make(schema, length, {list_array, list_list_array, large_list_array});
return Status::OK();
}
diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h
index 0ec9834..c2e56ab 100644
--- a/cpp/src/arrow/ipc/test-common.h
+++ b/cpp/src/arrow/ipc/test-common.h
@@ -49,6 +49,11 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li
std::shared_ptr<Array>* out);
ARROW_EXPORT
+Status MakeRandomLargeListArray(const std::shared_ptr<Array>& child_array, int num_lists,
+ bool include_nulls, MemoryPool* pool,
+ std::shared_ptr<Array>* out);
+
+ARROW_EXPORT
Status MakeRandomMapArray(const std::shared_ptr<Array>& child_array, int num_lists,
bool include_nulls, MemoryPool* pool,
std::shared_ptr<Array>* out);
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index ec37207..b5c16cd 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -278,16 +278,19 @@ class RecordBatchSerializer : public ArrayVisitor {
return Status::OK();
}
- Status VisitList(const ListArray& array) {
+ template <typename ArrayType>
+ Status VisitList(const ArrayType& array) {
+ using offset_type = typename ArrayType::offset_type;
+
std::shared_ptr<Buffer> value_offsets;
- RETURN_NOT_OK(GetZeroBasedValueOffsets<ListArray>(array, &value_offsets));
+ RETURN_NOT_OK(GetZeroBasedValueOffsets<ArrayType>(array, &value_offsets));
out_->body_buffers.emplace_back(value_offsets);
--max_recursion_depth_;
std::shared_ptr<Array> values = array.values();
- int32_t values_offset = 0;
- int32_t values_length = 0;
+ offset_type values_offset = 0;
+ offset_type values_length = 0;
if (value_offsets) {
values_offset = array.value_offset(0);
values_length = array.value_offset(array.length()) - values_offset;
@@ -352,6 +355,8 @@ class RecordBatchSerializer : public ArrayVisitor {
Status Visit(const ListArray& array) override { return VisitList(array); }
+ Status Visit(const LargeListArray& array) override { return VisitList(array); }
+
Status Visit(const MapArray& array) override { return VisitList(array); }
Status Visit(const FixedSizeListArray& array) override {
diff --git a/cpp/src/arrow/pretty_print-test.cc b/cpp/src/arrow/pretty_print-test.cc
index cdb230c..1eb09c7 100644
--- a/cpp/src/arrow/pretty_print-test.cc
+++ b/cpp/src/arrow/pretty_print-test.cc
@@ -349,7 +349,6 @@ TEST_F(TestPrettyPrint, BinaryType) {
TEST_F(TestPrettyPrint, ListType) {
auto list_type = list(int64());
- auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]");
static const char* ex = R"expected([
[
@@ -367,7 +366,6 @@ TEST_F(TestPrettyPrint, ListType) {
3
]
])expected";
- CheckArray(*array, {0, 10}, ex);
static const char* ex_2 = R"expected( [
[
null
@@ -384,7 +382,6 @@ TEST_F(TestPrettyPrint, ListType) {
3
]
])expected";
- CheckArray(*array, {2, 10}, ex_2);
static const char* ex_3 = R"expected([
[
null
@@ -395,6 +392,16 @@ TEST_F(TestPrettyPrint, ListType) {
3
]
])expected";
+
+ auto array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]");
+ CheckArray(*array, {0, 10}, ex);
+ CheckArray(*array, {2, 10}, ex_2);
+ CheckStream(*array, {0, 1}, ex_3);
+
+ list_type = large_list(int64());
+ array = ArrayFromJSON(list_type, "[[null], [], null, [4, 6, 7], [2, 3]]");
+ CheckArray(*array, {0, 10}, ex);
+ CheckArray(*array, {2, 10}, ex_2);
CheckStream(*array, {0, 1}, ex_3);
}
diff --git a/cpp/src/arrow/pretty_print.cc b/cpp/src/arrow/pretty_print.cc
index 5a54e13..88bd547 100644
--- a/cpp/src/arrow/pretty_print.cc
+++ b/cpp/src/arrow/pretty_print.cc
@@ -247,6 +247,7 @@ class ArrayPrinter : public PrettyPrinter {
template <typename T>
inline typename std::enable_if<std::is_base_of<ListArray, T>::value ||
+ std::is_base_of<LargeListArray, T>::value ||
std::is_base_of<FixedSizeListArray, T>::value,
Status>::type
WriteDataValues(const T& array) {
@@ -320,6 +321,7 @@ class ArrayPrinter : public PrettyPrinter {
std::is_base_of<BinaryArray, T>::value ||
std::is_base_of<LargeBinaryArray, T>::value ||
std::is_base_of<ListArray, T>::value ||
+ std::is_base_of<LargeListArray, T>::value ||
std::is_base_of<MapArray, T>::value ||
std::is_base_of<FixedSizeListArray, T>::value,
Status>::type
diff --git a/cpp/src/arrow/scalar.cc b/cpp/src/arrow/scalar.cc
index 9f14b0f..7c3a4ee 100644
--- a/cpp/src/arrow/scalar.cc
+++ b/cpp/src/arrow/scalar.cc
@@ -87,12 +87,12 @@ Decimal128Scalar::Decimal128Scalar(const Decimal128& value,
const std::shared_ptr<DataType>& type, bool is_valid)
: Scalar{type, is_valid}, value(value) {}
-ListScalar::ListScalar(const std::shared_ptr<Array>& value,
- const std::shared_ptr<DataType>& type, bool is_valid)
+BaseListScalar::BaseListScalar(const std::shared_ptr<Array>& value,
+ const std::shared_ptr<DataType>& type, bool is_valid)
: Scalar{type, is_valid}, value(value) {}
-ListScalar::ListScalar(const std::shared_ptr<Array>& value, bool is_valid)
- : ListScalar(value, value->type(), is_valid) {}
+BaseListScalar::BaseListScalar(const std::shared_ptr<Array>& value, bool is_valid)
+ : BaseListScalar(value, value->type(), is_valid) {}
MapScalar::MapScalar(const std::shared_ptr<Array>& keys,
const std::shared_ptr<Array>& items,
diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h
index 76aecd0..0919289 100644
--- a/cpp/src/arrow/scalar.h
+++ b/cpp/src/arrow/scalar.h
@@ -190,13 +190,21 @@ struct ARROW_EXPORT Decimal128Scalar : public Scalar {
bool is_valid = true);
};
-struct ARROW_EXPORT ListScalar : public Scalar {
+struct ARROW_EXPORT BaseListScalar : public Scalar {
std::shared_ptr<Array> value;
- ListScalar(const std::shared_ptr<Array>& value, const std::shared_ptr<DataType>& type,
- bool is_valid = true);
+ BaseListScalar(const std::shared_ptr<Array>& value,
+ const std::shared_ptr<DataType>& type, bool is_valid = true);
- explicit ListScalar(const std::shared_ptr<Array>& value, bool is_valid = true);
+ BaseListScalar(const std::shared_ptr<Array>& value, bool is_valid);
+};
+
+struct ARROW_EXPORT ListScalar : public BaseListScalar {
+ using BaseListScalar::BaseListScalar;
+};
+
+struct ARROW_EXPORT LargeListScalar : public BaseListScalar {
+ using BaseListScalar::BaseListScalar;
};
struct ARROW_EXPORT MapScalar : public Scalar {
diff --git a/cpp/src/arrow/type-test.cc b/cpp/src/arrow/type-test.cc
index 7bfb720..eb49227 100644
--- a/cpp/src/arrow/type-test.cc
+++ b/cpp/src/arrow/type-test.cc
@@ -404,6 +404,26 @@ TEST(TestListType, Basics) {
ASSERT_EQ("list<item: list<item: string>>", lt2.ToString());
}
+TEST(TestLargeListType, Basics) {
+ std::shared_ptr<DataType> vt = std::make_shared<UInt8Type>();
+
+ LargeListType list_type(vt);
+ ASSERT_EQ(list_type.id(), Type::LARGE_LIST);
+
+ ASSERT_EQ("large_list", list_type.name());
+ ASSERT_EQ("large_list<item: uint8>", list_type.ToString());
+
+ ASSERT_EQ(list_type.value_type()->id(), vt->id());
+ ASSERT_EQ(list_type.value_type()->id(), vt->id());
+
+ std::shared_ptr<DataType> st = std::make_shared<StringType>();
+ std::shared_ptr<DataType> lt = std::make_shared<LargeListType>(st);
+ ASSERT_EQ("large_list<item: string>", lt->ToString());
+
+ LargeListType lt2(lt);
+ ASSERT_EQ("large_list<item: large_list<item: string>>", lt2.ToString());
+}
+
TEST(TestMapType, Basics) {
std::shared_ptr<DataType> kt = std::make_shared<StringType>();
std::shared_ptr<DataType> it = std::make_shared<UInt8Type>();
@@ -563,6 +583,21 @@ TEST(TestTimestampType, ToString) {
ASSERT_EQ("timestamp[us]", t4->ToString());
}
+TEST(TestListType, Equals) {
+ auto t1 = list(utf8());
+ auto t2 = list(utf8());
+ auto t3 = list(binary());
+ auto t4 = large_list(binary());
+ auto t5 = large_list(binary());
+ auto t6 = large_list(float64());
+
+ ASSERT_TRUE(t1->Equals(t2));
+ ASSERT_FALSE(t1->Equals(t3));
+ ASSERT_FALSE(t3->Equals(t4));
+ ASSERT_TRUE(t4->Equals(t5));
+ ASSERT_FALSE(t5->Equals(t6));
+}
+
TEST(TestNestedType, Equals) {
auto create_struct = [](std::string inner_name,
std::string struct_name) -> std::shared_ptr<Field> {
diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc
index dc00a79..d8ed7bb 100644
--- a/cpp/src/arrow/type.cc
+++ b/cpp/src/arrow/type.cc
@@ -149,6 +149,12 @@ std::string ListType::ToString() const {
return s.str();
}
+std::string LargeListType::ToString() const {
+ std::stringstream s;
+ s << "large_list<" << value_field()->ToString() << ">";
+ return s.str();
+}
+
MapType::MapType(const std::shared_ptr<DataType>& key_type,
const std::shared_ptr<DataType>& item_type, bool keys_sorted)
: ListType(std::make_shared<Field>(
@@ -721,6 +727,14 @@ std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_field) {
return std::make_shared<ListType>(value_field);
}
+std::shared_ptr<DataType> large_list(const std::shared_ptr<DataType>& value_type) {
+ return std::make_shared<LargeListType>(value_type);
+}
+
+std::shared_ptr<DataType> large_list(const std::shared_ptr<Field>& value_field) {
+ return std::make_shared<LargeListType>(value_field);
+}
+
std::shared_ptr<DataType> map(const std::shared_ptr<DataType>& key_type,
const std::shared_ptr<DataType>& value_type,
bool keys_sorted) {
diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h
index 572b888..753c73e 100644
--- a/cpp/src/arrow/type.h
+++ b/cpp/src/arrow/type.h
@@ -149,7 +149,10 @@ struct Type {
LARGE_STRING,
/// Like BINARY, but with 64-bit offsets
- LARGE_BINARY
+ LARGE_BINARY,
+
+ /// Like LIST, but with 64-bit offsets
+ LARGE_LIST
};
};
@@ -348,6 +351,8 @@ class ARROW_EXPORT CTypeImpl : public BASE {
DataTypeLayout layout() const override { return {{1, bit_width()}, false}; }
+ std::string name() const override { return DERIVED::type_name(); }
+
std::string ToString() const override { return this->name(); }
};
@@ -363,6 +368,8 @@ class ARROW_EXPORT NullType : public DataType, public NoExtraMeta {
public:
static constexpr Type::type type_id = Type::NA;
+ static constexpr const char* type_name() { return "null"; }
+
NullType() : DataType(Type::NA) {}
std::string ToString() const override;
@@ -379,6 +386,8 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta {
public:
static constexpr Type::type type_id = Type::BOOL;
+ static constexpr const char* type_name() { return "bool"; }
+
BooleanType() : FixedWidthType(Type::BOOL) {}
std::string ToString() const override;
@@ -386,6 +395,7 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta {
DataTypeLayout layout() const override { return {{1, 1}, false}; }
int bit_width() const override { return 1; }
+
std::string name() const override { return "bool"; }
};
@@ -393,56 +403,56 @@ class ARROW_EXPORT BooleanType : public FixedWidthType, public NoExtraMeta {
class ARROW_EXPORT UInt8Type
: public detail::IntegerTypeImpl<UInt8Type, Type::UINT8, uint8_t> {
public:
- std::string name() const override { return "uint8"; }
+ static constexpr const char* type_name() { return "uint8"; }
};
/// Concrete type class for signed 8-bit integer data
class ARROW_EXPORT Int8Type
: public detail::IntegerTypeImpl<Int8Type, Type::INT8, int8_t> {
public:
- std::string name() const override { return "int8"; }
+ static constexpr const char* type_name() { return "int8"; }
};
/// Concrete type class for unsigned 16-bit integer data
class ARROW_EXPORT UInt16Type
: public detail::IntegerTypeImpl<UInt16Type, Type::UINT16, uint16_t> {
public:
- std::string name() const override { return "uint16"; }
+ static constexpr const char* type_name() { return "uint16"; }
};
/// Concrete type class for signed 16-bit integer data
class ARROW_EXPORT Int16Type
: public detail::IntegerTypeImpl<Int16Type, Type::INT16, int16_t> {
public:
- std::string name() const override { return "int16"; }
+ static constexpr const char* type_name() { return "int16"; }
};
/// Concrete type class for unsigned 32-bit integer data
class ARROW_EXPORT UInt32Type
: public detail::IntegerTypeImpl<UInt32Type, Type::UINT32, uint32_t> {
public:
- std::string name() const override { return "uint32"; }
+ static constexpr const char* type_name() { return "uint32"; }
};
/// Concrete type class for signed 32-bit integer data
class ARROW_EXPORT Int32Type
: public detail::IntegerTypeImpl<Int32Type, Type::INT32, int32_t> {
public:
- std::string name() const override { return "int32"; }
+ static constexpr const char* type_name() { return "int32"; }
};
/// Concrete type class for unsigned 64-bit integer data
class ARROW_EXPORT UInt64Type
: public detail::IntegerTypeImpl<UInt64Type, Type::UINT64, uint64_t> {
public:
- std::string name() const override { return "uint64"; }
+ static constexpr const char* type_name() { return "uint64"; }
};
/// Concrete type class for signed 64-bit integer data
class ARROW_EXPORT Int64Type
: public detail::IntegerTypeImpl<Int64Type, Type::INT64, int64_t> {
public:
- std::string name() const override { return "int64"; }
+ static constexpr const char* type_name() { return "int64"; }
};
/// Concrete type class for 16-bit floating-point data
@@ -451,7 +461,7 @@ class ARROW_EXPORT HalfFloatType
uint16_t> {
public:
Precision precision() const override;
- std::string name() const override { return "halffloat"; }
+ static constexpr const char* type_name() { return "halffloat"; }
};
/// Concrete type class for 32-bit floating-point data (C "float")
@@ -459,7 +469,7 @@ class ARROW_EXPORT FloatType
: public detail::CTypeImpl<FloatType, FloatingPointType, Type::FLOAT, float> {
public:
Precision precision() const override;
- std::string name() const override { return "float"; }
+ static constexpr const char* type_name() { return "float"; }
};
/// Concrete type class for 64-bit floating-point data (C "double")
@@ -467,7 +477,13 @@ class ARROW_EXPORT DoubleType
: public detail::CTypeImpl<DoubleType, FloatingPointType, Type::DOUBLE, double> {
public:
Precision precision() const override;
- std::string name() const override { return "double"; }
+ static constexpr const char* type_name() { return "double"; }
+};
+
+/// \brief Base class for all variable-size list data types (ListType, LargeListType)
+class ARROW_EXPORT BaseListType : public NestedType {
+ public:
+ using NestedType::NestedType;  // inherit NestedType's constructors unchanged
};
/// \brief Concrete type class for list data
@@ -475,16 +491,18 @@ class ARROW_EXPORT DoubleType
/// List data is nested data where each value is a variable number of
/// child items. Lists can be recursively nested, for example
/// list(list(int32)).
-class ARROW_EXPORT ListType : public NestedType {
+class ARROW_EXPORT ListType : public BaseListType {
public:
static constexpr Type::type type_id = Type::LIST;
using offset_type = int32_t;
+ static constexpr const char* type_name() { return "list"; }
+
// List can contain any other logical value type
explicit ListType(const std::shared_ptr<DataType>& value_type)
: ListType(std::make_shared<Field>("item", value_type)) {}
- explicit ListType(const std::shared_ptr<Field>& value_field) : NestedType(Type::LIST) {
+ explicit ListType(const std::shared_ptr<Field>& value_field) : BaseListType(type_id) {
children_ = {value_field};
}
@@ -501,6 +519,38 @@ class ARROW_EXPORT ListType : public NestedType {
std::string name() const override { return "list"; }
};
+/// \brief Concrete type class for large list data
+///
+/// LargeListType is like ListType but with 64-bit rather than 32-bit offsets.
+class ARROW_EXPORT LargeListType : public BaseListType {
+ public:
+ static constexpr Type::type type_id = Type::LARGE_LIST;
+ using offset_type = int64_t;  // 64-bit offsets (ListType uses int32_t)
+
+ static constexpr const char* type_name() { return "large_list"; }
+
+ // Any logical value type is accepted; it is wrapped in a child field named "item"
+ explicit LargeListType(const std::shared_ptr<DataType>& value_type)
+ : LargeListType(std::make_shared<Field>("item", value_type)) {}
+
+ explicit LargeListType(const std::shared_ptr<Field>& value_field)
+ : BaseListType(type_id) {
+ children_ = {value_field};  // the value field is the only child
+ }
+
+ std::shared_ptr<Field> value_field() const { return children_[0]; }  // the single child field
+
+ std::shared_ptr<DataType> value_type() const { return children_[0]->type(); }  // type of that child
+
+ DataTypeLayout layout() const override {
+ return {{1, CHAR_BIT * sizeof(offset_type)}, false};  // CHAR_BIT * sizeof(offset_type) = bits per offset
+ }
+
+ std::string ToString() const override;
+
+ std::string name() const override { return "large_list"; }  // same string as type_name()
+};
+
/// \brief Concrete type class for map data
///
/// Map data is nested data where each value is a variable number of
@@ -510,6 +560,8 @@ class ARROW_EXPORT MapType : public ListType {
public:
static constexpr Type::type type_id = Type::MAP;
+ static constexpr const char* type_name() { return "map"; }
+
MapType(const std::shared_ptr<DataType>& key_type,
const std::shared_ptr<DataType>& item_type, bool keys_sorted = false);
@@ -532,6 +584,8 @@ class ARROW_EXPORT FixedSizeListType : public NestedType {
public:
static constexpr Type::type type_id = Type::FIXED_SIZE_LIST;
+ static constexpr const char* type_name() { return "fixed_size_list"; }
+
// List can contain any other logical value type
FixedSizeListType(const std::shared_ptr<DataType>& value_type, int32_t list_size)
: FixedSizeListType(std::make_shared<Field>("item", value_type), list_size) {}
@@ -570,6 +624,8 @@ class ARROW_EXPORT BinaryType : public BaseBinaryType {
static constexpr bool is_utf8 = false;
using offset_type = int32_t;
+ static constexpr const char* type_name() { return "binary"; }
+
BinaryType() : BinaryType(Type::BINARY) {}
DataTypeLayout layout() const override {
@@ -592,6 +648,8 @@ class ARROW_EXPORT LargeBinaryType : public BaseBinaryType {
static constexpr bool is_utf8 = false;
using offset_type = int64_t;
+ static constexpr const char* type_name() { return "large_binary"; }
+
LargeBinaryType() : LargeBinaryType(Type::LARGE_BINARY) {}
DataTypeLayout layout() const override {
@@ -613,6 +671,8 @@ class ARROW_EXPORT StringType : public BinaryType {
static constexpr Type::type type_id = Type::STRING;
static constexpr bool is_utf8 = true;
+ static constexpr const char* type_name() { return "utf8"; }
+
StringType() : BinaryType(Type::STRING) {}
std::string ToString() const override;
@@ -625,6 +685,8 @@ class ARROW_EXPORT LargeStringType : public LargeBinaryType {
static constexpr Type::type type_id = Type::LARGE_STRING;
static constexpr bool is_utf8 = true;
+ static constexpr const char* type_name() { return "large_utf8"; }
+
LargeStringType() : LargeBinaryType(Type::LARGE_STRING) {}
std::string ToString() const override;
@@ -636,6 +698,8 @@ class ARROW_EXPORT FixedSizeBinaryType : public FixedWidthType, public Parametri
public:
static constexpr Type::type type_id = Type::FIXED_SIZE_BINARY;
+ static constexpr const char* type_name() { return "fixed_size_binary"; }
+
explicit FixedSizeBinaryType(int32_t byte_width)
: FixedWidthType(Type::FIXED_SIZE_BINARY), byte_width_(byte_width) {}
explicit FixedSizeBinaryType(int32_t byte_width, Type::type override_type_id)
@@ -658,6 +722,8 @@ class ARROW_EXPORT StructType : public NestedType {
public:
static constexpr Type::type type_id = Type::STRUCT;
+ static constexpr const char* type_name() { return "struct"; }
+
explicit StructType(const std::vector<std::shared_ptr<Field>>& fields);
~StructType() override;
@@ -712,6 +778,8 @@ class ARROW_EXPORT Decimal128Type : public DecimalType {
public:
static constexpr Type::type type_id = Type::DECIMAL;
+ static constexpr const char* type_name() { return "decimal"; }
+
explicit Decimal128Type(int32_t precision, int32_t scale);
std::string ToString() const override;
@@ -727,6 +795,8 @@ class ARROW_EXPORT UnionType : public NestedType {
public:
static constexpr Type::type type_id = Type::UNION;
+ static constexpr const char* type_name() { return "union"; }
+
UnionType(const std::vector<std::shared_ptr<Field>>& fields,
const std::vector<uint8_t>& type_codes,
UnionMode::type mode = UnionMode::SPARSE);
@@ -779,6 +849,8 @@ class ARROW_EXPORT Date32Type : public DateType {
static constexpr Type::type type_id = Type::DATE32;
static constexpr DateUnit UNIT = DateUnit::DAY;
+ static constexpr const char* type_name() { return "date32"; }
+
using c_type = int32_t;
Date32Type();
@@ -797,6 +869,8 @@ class ARROW_EXPORT Date64Type : public DateType {
static constexpr Type::type type_id = Type::DATE64;
static constexpr DateUnit UNIT = DateUnit::MILLI;
+ static constexpr const char* type_name() { return "date64"; }
+
using c_type = int64_t;
Date64Type();
@@ -834,6 +908,8 @@ class ARROW_EXPORT Time32Type : public TimeType {
static constexpr Type::type type_id = Type::TIME32;
using c_type = int32_t;
+ static constexpr const char* type_name() { return "time32"; }
+
int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
explicit Time32Type(TimeUnit::type unit = TimeUnit::MILLI);
@@ -850,6 +926,8 @@ class ARROW_EXPORT Time64Type : public TimeType {
static constexpr Type::type type_id = Type::TIME64;
using c_type = int64_t;
+ static constexpr const char* type_name() { return "time64"; }
+
int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
explicit Time64Type(TimeUnit::type unit = TimeUnit::NANO);
@@ -898,6 +976,8 @@ class ARROW_EXPORT TimestampType : public TemporalType, public ParametricType {
typedef int64_t c_type;
static constexpr Type::type type_id = Type::TIMESTAMP;
+ static constexpr const char* type_name() { return "timestamp"; }
+
int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); }
explicit TimestampType(TimeUnit::type unit = TimeUnit::MILLI)
@@ -936,6 +1016,8 @@ class ARROW_EXPORT MonthIntervalType : public IntervalType {
using c_type = int32_t;
static constexpr Type::type type_id = Type::INTERVAL;
+ static constexpr const char* type_name() { return "month_interval"; }
+
IntervalType::type interval_type() const override { return IntervalType::MONTHS; }
int bit_width() const override { return static_cast<int>(sizeof(c_type) * CHAR_BIT); }
@@ -961,6 +1043,9 @@ class ARROW_EXPORT DayTimeIntervalType : public IntervalType {
static_assert(sizeof(DayMilliseconds) == 8,
"DayMilliseconds struct assumed to be of size 8 bytes");
static constexpr Type::type type_id = Type::INTERVAL;
+
+ static constexpr const char* type_name() { return "day_time_interval"; }
+
IntervalType::type interval_type() const override { return IntervalType::DAY_TIME; }
DayTimeIntervalType() : IntervalType() {}
@@ -980,6 +1065,8 @@ class ARROW_EXPORT DurationType : public TemporalType, public ParametricType {
static constexpr Type::type type_id = Type::DURATION;
using c_type = int64_t;
+ static constexpr const char* type_name() { return "duration"; }
+
int bit_width() const override { return static_cast<int>(sizeof(int64_t) * CHAR_BIT); }
explicit DurationType(TimeUnit::type unit = TimeUnit::MILLI)
@@ -1004,6 +1091,8 @@ class ARROW_EXPORT DictionaryType : public FixedWidthType {
public:
static constexpr Type::type type_id = Type::DICTIONARY;
+ static constexpr const char* type_name() { return "dictionary"; }
+
DictionaryType(const std::shared_ptr<DataType>& index_type,
const std::shared_ptr<DataType>& value_type, bool ordered = false);
@@ -1146,6 +1235,14 @@ std::shared_ptr<DataType> list(const std::shared_ptr<Field>& value_type);
ARROW_EXPORT
std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type);
+/// \brief Create a LargeListType instance from its child Field (the parameter is a Field despite being named value_type)
+ARROW_EXPORT
+std::shared_ptr<DataType> large_list(const std::shared_ptr<Field>& value_type);
+
+/// \brief Create a LargeListType instance from the DataType of its child values
+ARROW_EXPORT
+std::shared_ptr<DataType> large_list(const std::shared_ptr<DataType>& value_type);
+
/// \brief Create a MapType instance from its key and value DataTypes
ARROW_EXPORT
std::shared_ptr<DataType> map(const std::shared_ptr<DataType>& key_type,
diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h
index 9935af5..0711efc 100644
--- a/cpp/src/arrow/type_fwd.h
+++ b/cpp/src/arrow/type_fwd.h
@@ -90,6 +90,11 @@ class ListArray;
class ListBuilder;
struct ListScalar;
+class LargeListType;
+class LargeListArray;
+class LargeListBuilder;
+struct LargeListScalar;
+
class MapType;
class MapArray;
class MapBuilder;
diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h
index df3e280..2d05b5c 100644
--- a/cpp/src/arrow/type_traits.h
+++ b/cpp/src/arrow/type_traits.h
@@ -299,6 +299,20 @@ struct TypeTraits<ListType> {
using ArrayType = ListArray;
using BuilderType = ListBuilder;
using ScalarType = ListScalar;
+ using OffsetType = Int32Type;
+ using OffsetArrayType = Int32Array;
+ using OffsetBuilderType = Int32Builder;
+ constexpr static bool is_parameter_free = false;
+};
+
+template <>
+struct TypeTraits<LargeListType> {  // companion array/builder/scalar/offset types for LargeListType
+ using ArrayType = LargeListArray;
+ using BuilderType = LargeListBuilder;
+ using ScalarType = LargeListScalar;
+ using OffsetType = Int64Type;  // 64-bit offsets; TypeTraits<ListType> uses Int32Type here
+ using OffsetArrayType = Int64Array;
+ using OffsetBuilderType = Int64Builder;
constexpr static bool is_parameter_free = false;
};
@@ -469,10 +483,18 @@ using enable_if_fixed_size_binary =
typename std::enable_if<std::is_base_of<FixedSizeBinaryType, T>::value, R>::type;
template <typename T, typename R = void>
+using enable_if_base_list =
+ typename std::enable_if<std::is_base_of<BaseListType, T>::value, R>::type;
+
+template <typename T, typename R = void>
using enable_if_list =
typename std::enable_if<std::is_base_of<ListType, T>::value, R>::type;
template <typename T, typename R = void>
+using enable_if_large_list =
+ typename std::enable_if<std::is_base_of<LargeListType, T>::value, R>::type;
+
+template <typename T, typename R = void>
using enable_if_fixed_size_list =
typename std::enable_if<std::is_base_of<FixedSizeListType, T>::value, R>::type;
diff --git a/cpp/src/arrow/visitor.cc b/cpp/src/arrow/visitor.cc
index 2ec6c64..cb4f165 100644
--- a/cpp/src/arrow/visitor.cc
+++ b/cpp/src/arrow/visitor.cc
@@ -59,6 +59,7 @@ ARRAY_VISITOR_DEFAULT(DayTimeIntervalArray)
ARRAY_VISITOR_DEFAULT(MonthIntervalArray)
ARRAY_VISITOR_DEFAULT(DurationArray)
ARRAY_VISITOR_DEFAULT(ListArray)
+ARRAY_VISITOR_DEFAULT(LargeListArray)
ARRAY_VISITOR_DEFAULT(MapArray)
ARRAY_VISITOR_DEFAULT(FixedSizeListArray)
ARRAY_VISITOR_DEFAULT(StructArray)
@@ -105,6 +106,7 @@ TYPE_VISITOR_DEFAULT(MonthIntervalType)
TYPE_VISITOR_DEFAULT(DurationType)
TYPE_VISITOR_DEFAULT(Decimal128Type)
TYPE_VISITOR_DEFAULT(ListType)
+TYPE_VISITOR_DEFAULT(LargeListType)
TYPE_VISITOR_DEFAULT(MapType)
TYPE_VISITOR_DEFAULT(FixedSizeListType)
TYPE_VISITOR_DEFAULT(StructType)
@@ -151,6 +153,7 @@ SCALAR_VISITOR_DEFAULT(MonthIntervalScalar)
SCALAR_VISITOR_DEFAULT(DurationScalar)
SCALAR_VISITOR_DEFAULT(Decimal128Scalar)
SCALAR_VISITOR_DEFAULT(ListScalar)
+SCALAR_VISITOR_DEFAULT(LargeListScalar)
SCALAR_VISITOR_DEFAULT(MapScalar)
SCALAR_VISITOR_DEFAULT(FixedSizeListScalar)
SCALAR_VISITOR_DEFAULT(StructScalar)
diff --git a/cpp/src/arrow/visitor.h b/cpp/src/arrow/visitor.h
index 1c854c4..825c172 100644
--- a/cpp/src/arrow/visitor.h
+++ b/cpp/src/arrow/visitor.h
@@ -56,6 +56,7 @@ class ARROW_EXPORT ArrayVisitor {
virtual Status Visit(const DurationArray& array);
virtual Status Visit(const Decimal128Array& array);
virtual Status Visit(const ListArray& array);
+ virtual Status Visit(const LargeListArray& array);
virtual Status Visit(const MapArray& array);
virtual Status Visit(const FixedSizeListArray& array);
virtual Status Visit(const StructArray& array);
@@ -96,6 +97,7 @@ class ARROW_EXPORT TypeVisitor {
virtual Status Visit(const DurationType& type);
virtual Status Visit(const Decimal128Type& type);
virtual Status Visit(const ListType& type);
+ virtual Status Visit(const LargeListType& type);
virtual Status Visit(const MapType& type);
virtual Status Visit(const FixedSizeListType& type);
virtual Status Visit(const StructType& type);
@@ -136,6 +138,7 @@ class ARROW_EXPORT ScalarVisitor {
virtual Status Visit(const DurationScalar& scalar);
virtual Status Visit(const Decimal128Scalar& scalar);
virtual Status Visit(const ListScalar& scalar);
+ virtual Status Visit(const LargeListScalar& scalar);
virtual Status Visit(const MapScalar& scalar);
virtual Status Visit(const FixedSizeListScalar& scalar);
virtual Status Visit(const StructScalar& scalar);
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index 3ed058e..84d89da 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -58,6 +58,7 @@ namespace arrow {
ACTION(Time64); \
ACTION(Decimal128); \
ACTION(List); \
+ ACTION(LargeList); \
ACTION(Map); \
ACTION(FixedSizeList); \
ACTION(Struct); \
diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc
index a786065..bb2bab1 100644
--- a/cpp/src/parquet/arrow/writer.cc
+++ b/cpp/src/parquet/arrow/writer.cc
@@ -111,6 +111,7 @@ class LevelBuilder {
" not supported yet"); \
}
+ NOT_IMPLEMENTED_VISIT(LargeList)
NOT_IMPLEMENTED_VISIT(Map)
NOT_IMPLEMENTED_VISIT(FixedSizeList)
NOT_IMPLEMENTED_VISIT(Struct)
@@ -118,6 +119,8 @@ class LevelBuilder {
NOT_IMPLEMENTED_VISIT(Dictionary)
NOT_IMPLEMENTED_VISIT(Extension)
+#undef NOT_IMPLEMENTED_VISIT
+
Status GenerateLevels(const Array& array, const std::shared_ptr<Field>& field,
int64_t* values_offset, int64_t* num_values, int64_t* num_levels,
const std::shared_ptr<ResizableBuffer>& def_levels_scratch,
diff --git a/format/Schema.fbs b/format/Schema.fbs
index 06bcf6e..4ce66d6 100644
--- a/format/Schema.fbs
+++ b/format/Schema.fbs
@@ -47,6 +47,11 @@ table Struct_ {
table List {
}
+/// Same as List, but with 64-bit offsets, allowing the representation of
+/// extremely large data values.
+table LargeList {
+}
+
table FixedSizeList {
/// Number of list items per value
listSize: int;
@@ -248,6 +253,7 @@ union Type {
Duration,
LargeBinary,
LargeUtf8,
+ LargeList,
}
/// ----------------------------------------------------------------------