You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/11/07 12:19:28 UTC
[arrow] branch master updated: ARROW-10261: [Rust] [Breaking]
Change List datatype to Box
This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b1dfd96 ARROW-10261: [Rust] [Breaking] Change List datatype to Box<Field>
b1dfd96 is described below
commit b1dfd964ab4002f2d17f08e60693de907a70c049
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Sat Nov 7 14:18:38 2020 +0200
ARROW-10261: [Rust] [Breaking] Change List datatype to Box<Field>
This changes a list datatype to use `Box<Field>` instead of `Box<DataType>`.
This change is needed in order to make both Parquet and IPC roundtrips pass.
The C++ implementation uses `Field`, as it allows for preservice list field names and nullability.
There are some implementation details which we did not cover in this PR, and will work on as a follow-up, and these are:
* Documenting the naming conventions of Arrow and Parquet on list names (Arrow C++ uses "item", Parquet has different compatibility options)
* Fixing Parquet failing list tests that have been ignored (I'd like to do this separately, as this isn't a full fix)
Closes #8608 from nevi-me/ARROW-10261-BREAKING
Authored-by: Neville Dipale <ne...@gmail.com>
Signed-off-by: Neville Dipale <ne...@gmail.com>
---
rust/arrow/examples/builders.rs | 3 +-
rust/arrow/src/array/array.rs | 51 ++++--
rust/arrow/src/array/builder.rs | 89 +++++++---
rust/arrow/src/compute/kernels/cast.rs | 74 +++++---
rust/arrow/src/compute/kernels/comparison.rs | 5 +-
rust/arrow/src/compute/kernels/concat.rs | 6 +-
rust/arrow/src/compute/kernels/filter.rs | 7 +-
rust/arrow/src/compute/kernels/limit.rs | 3 +-
rust/arrow/src/compute/kernels/take.rs | 21 ++-
rust/arrow/src/compute/util.rs | 4 +-
rust/arrow/src/datatypes.rs | 101 +++++++----
rust/arrow/src/ipc/convert.rs | 112 +++++-------
rust/arrow/src/ipc/reader.rs | 8 +-
rust/arrow/src/json/reader.rs | 197 ++++++++++++++-------
rust/arrow/src/record_batch.rs | 1 +
rust/arrow/src/util/integration_util.rs | 35 ++--
rust/arrow/test/data/integration.json | 8 +-
.../src/physical_plan/distinct_expressions.rs | 2 +-
rust/datafusion/src/physical_plan/functions.rs | 6 +-
rust/datafusion/src/scalar.rs | 11 +-
rust/datafusion/tests/sql.rs | 8 +-
rust/parquet/src/arrow/array_reader.rs | 10 +-
rust/parquet/src/arrow/arrow_writer.rs | 52 ++++--
rust/parquet/src/arrow/schema.rs | 157 ++++++++++------
24 files changed, 619 insertions(+), 352 deletions(-)
diff --git a/rust/arrow/examples/builders.rs b/rust/arrow/examples/builders.rs
index 01edf2d..61cce0e 100644
--- a/rust/arrow/examples/builders.rs
+++ b/rust/arrow/examples/builders.rs
@@ -99,7 +99,8 @@ fn main() {
let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index f28d9f3..e168f75 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -2958,7 +2958,8 @@ mod tests {
let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type.clone())
.len(3)
.add_buffer(value_offsets.clone())
@@ -3027,7 +3028,8 @@ mod tests {
let value_offsets = Buffer::from(&[0i64, 3, 6, 8].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type.clone())
.len(3)
.add_buffer(value_offsets.clone())
@@ -3178,7 +3180,10 @@ mod tests {
.build();
// Construct a list array from the above two
- let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 3);
+ let list_data_type = DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::Int32, false)),
+ 3,
+ );
let list_data = ArrayData::builder(list_data_type.clone())
.len(3)
.add_child_data(value_data.clone())
@@ -3244,7 +3249,10 @@ mod tests {
.build();
// Construct a list array from the above two
- let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 3);
+ let list_data_type = DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::Int32, false)),
+ 3,
+ );
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_child_data(value_data)
@@ -3275,7 +3283,8 @@ mod tests {
bit_util::set_bit(&mut null_bits, 8);
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(9)
.add_buffer(value_offsets)
@@ -3339,7 +3348,8 @@ mod tests {
bit_util::set_bit(&mut null_bits, 8);
// Construct a list array from the above two
- let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(9)
.add_buffer(value_offsets)
@@ -3401,7 +3411,10 @@ mod tests {
bit_util::set_bit(&mut null_bits, 4);
// Construct a fixed size list array from the above two
- let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 2);
+ let list_data_type = DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::Int32, false)),
+ 2,
+ );
let list_data = ArrayData::builder(list_data_type)
.len(5)
.add_child_data(value_data.clone())
@@ -3449,7 +3462,8 @@ mod tests {
.len(8)
.add_buffer(Buffer::from(&[0, 1, 2, 3, 4, 5, 6, 7].to_byte_slice()))
.build();
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_child_data(value_data)
@@ -3463,7 +3477,8 @@ mod tests {
)]
fn test_list_array_invalid_child_array_len() {
let value_offsets = Buffer::from(&[0, 2, 5, 7].to_byte_slice());
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
@@ -3481,7 +3496,8 @@ mod tests {
let value_offsets = Buffer::from(&[2, 2, 5, 7].to_byte_slice());
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
@@ -3874,11 +3890,13 @@ mod tests {
.add_child_data(ArrayData::builder(DataType::Boolean).build())
.build();
- let array_data =
- ArrayData::builder(DataType::FixedSizeList(Box::new(DataType::Binary), 4))
- .len(3)
- .add_child_data(values_data)
- .build();
+ let array_data = ArrayData::builder(DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::Binary, false)),
+ 4,
+ ))
+ .len(3)
+ .add_child_data(values_data)
+ .build();
let list_array = FixedSizeListArray::from(array_data);
FixedSizeBinaryArray::from(list_array);
}
@@ -4245,7 +4263,8 @@ mod tests {
.add_buffer(Buffer::from(values.to_byte_slice()))
.build();
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.add_buffer(buf2)
.add_child_data(value_data)
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index e93dfd1..09fb9e7 100644
--- a/rust/arrow/src/array/builder.rs
+++ b/rust/arrow/src/array/builder.rs
@@ -775,7 +775,11 @@ where
///
/// This is used for validating array data types in `append_data`
fn data_type(&self) -> DataType {
- DataType::List(Box::new(self.values_builder.data_type()))
+ DataType::List(Box::new(Field::new(
+ "item",
+ self.values_builder.data_type(),
+ true,
+ )))
}
/// Returns the builder as a mutable `Any` reference.
@@ -839,15 +843,19 @@ where
let offset_buffer = self.offsets_builder.finish();
let null_bit_buffer = self.bitmap_builder.finish();
+ let nulls = bit_util::count_set_bits(null_bit_buffer.data());
self.offsets_builder.append(0).unwrap();
- let data =
- ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone())))
- .len(len)
- .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
- .add_buffer(offset_buffer)
- .add_child_data(values_data)
- .null_bit_buffer(null_bit_buffer)
- .build();
+ let data = ArrayData::builder(DataType::List(Box::new(Field::new(
+ "item",
+ values_data.data_type().clone(),
+ true, // TODO: find a consistent way of getting this
+ ))))
+ .len(len)
+ .null_count(len - nulls)
+ .add_buffer(offset_buffer)
+ .add_child_data(values_data)
+ .null_bit_buffer(null_bit_buffer)
+ .build();
ListArray::from(data)
}
@@ -980,7 +988,11 @@ where
///
/// This is used for validating array data types in `append_data`
fn data_type(&self) -> DataType {
- DataType::LargeList(Box::new(self.values_builder.data_type()))
+ DataType::LargeList(Box::new(Field::new(
+ "item",
+ self.values_builder.data_type(),
+ true,
+ )))
}
/// Returns the builder as a mutable `Any` reference.
@@ -1044,12 +1056,15 @@ where
let offset_buffer = self.offsets_builder.finish();
let null_bit_buffer = self.bitmap_builder.finish();
+ let nulls = bit_util::count_set_bits(null_bit_buffer.data());
self.offsets_builder.append(0).unwrap();
- let data = ArrayData::builder(DataType::LargeList(Box::new(
+ let data = ArrayData::builder(DataType::LargeList(Box::new(Field::new(
+ "item",
values_data.data_type().clone(),
- )))
+ true,
+ ))))
.len(len)
- .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
+ .null_count(len - nulls)
.add_buffer(offset_buffer)
.add_child_data(values_data)
.null_bit_buffer(null_bit_buffer)
@@ -1155,7 +1170,10 @@ where
///
/// This is used for validating array data types in `append_data`
fn data_type(&self) -> DataType {
- DataType::FixedSizeList(Box::new(self.values_builder.data_type()), self.list_len)
+ DataType::FixedSizeList(
+ Box::new(Field::new("item", self.values_builder.data_type(), true)),
+ self.list_len,
+ )
}
/// Returns the builder as a mutable `Any` reference.
@@ -1230,12 +1248,13 @@ where
}
let null_bit_buffer = self.bitmap_builder.finish();
+ let nulls = bit_util::count_set_bits(null_bit_buffer.data());
let data = ArrayData::builder(DataType::FixedSizeList(
- Box::new(values_data.data_type().clone()),
+ Box::new(Field::new("item", values_data.data_type().clone(), true)),
self.list_len,
))
.len(len)
- .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
+ .null_count(len - nulls)
.add_child_data(values_data)
.null_bit_buffer(null_bit_buffer)
.build();
@@ -1445,7 +1464,7 @@ fn append_binary_data(
)) as ArrayDataRef;
Arc::new(ArrayData::new(
- DataType::List(Box::new(DataType::UInt8)),
+ DataType::List(Box::new(Field::new("item", DataType::UInt8, true))),
array.len(),
None,
array.null_buffer().cloned(),
@@ -1497,7 +1516,11 @@ fn append_large_binary_data(
)) as ArrayDataRef;
Arc::new(ArrayData::new(
- DataType::LargeList(Box::new(DataType::UInt8)),
+ DataType::LargeList(Box::new(Field::new(
+ "item",
+ DataType::UInt8,
+ true,
+ ))),
array.len(),
None,
array.null_buffer().cloned(),
@@ -1595,7 +1618,10 @@ impl ArrayBuilder for FixedSizeBinaryBuilder {
vec![],
)) as ArrayDataRef;
let list_data = Arc::new(ArrayData::new(
- DataType::FixedSizeList(Box::new(DataType::UInt8), self.builder.list_len),
+ DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::UInt8, true)),
+ self.builder.list_len,
+ ),
array.len(),
None,
array.null_buffer().cloned(),
@@ -3368,11 +3394,14 @@ mod tests {
}
#[test]
- #[should_panic(expected = "Data type List(Int64) is not currently supported")]
+ #[should_panic(
+ expected = "Data type List(Field { name: \"item\", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false }) is not currently supported"
+ )]
fn test_struct_array_builder_from_schema_unsupported_type() {
let mut fields = Vec::new();
fields.push(Field::new("f1", DataType::Int16, false));
- let list_type = DataType::List(Box::new(DataType::Int64));
+ let list_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int64, true)));
fields.push(Field::new("f2", list_type, false));
let _ = StructBuilder::from_fields(fields, 5);
@@ -3667,7 +3696,7 @@ mod tests {
let list_value_offsets =
Buffer::from(&[0, 3, 5, 11, 13, 13, 15, 15, 17].to_byte_slice());
let expected_list_data = ArrayData::new(
- DataType::List(Box::new(DataType::Int64)),
+ DataType::List(Box::new(Field::new("item", DataType::Int64, true))),
8,
None,
None,
@@ -3753,7 +3782,7 @@ mod tests {
&[0, 3, 5, 5, 13, 15, 15, 15, 19, 19, 19, 19, 23].to_byte_slice(),
);
let expected_list_data = ArrayData::new(
- DataType::List(Box::new(DataType::Int64)),
+ DataType::List(Box::new(Field::new("item", DataType::Int64, true))),
12,
None,
None,
@@ -3795,7 +3824,7 @@ mod tests {
]);
let list_value_offsets = Buffer::from(&[0, 2, 3, 6].to_byte_slice());
let list_data = ArrayData::new(
- DataType::List(Box::new(DataType::Utf8)),
+ DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
3,
None,
None,
@@ -3830,7 +3859,7 @@ mod tests {
]);
let list_value_offsets = Buffer::from(&[0, 2, 2, 4, 5, 8, 9, 12].to_byte_slice());
let expected_list_data = ArrayData::new(
- DataType::List(Box::new(DataType::Utf8)),
+ DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
7,
None,
None, // is this correct?
@@ -3918,7 +3947,10 @@ mod tests {
Some(12),
]);
let expected_list_data = ArrayData::new(
- DataType::FixedSizeList(Box::new(DataType::UInt16), 2),
+ DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::UInt16, true)),
+ 2,
+ ),
12,
None,
None,
@@ -3988,7 +4020,10 @@ mod tests {
None,
]);
let expected_list_data = ArrayData::new(
- DataType::FixedSizeList(Box::new(DataType::UInt8), 2),
+ DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::UInt8, true)),
+ 2,
+ ),
12,
None,
None,
diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs
index ab34c6a..9d1b7fc 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -57,9 +57,11 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
match (from_type, to_type) {
(Struct(_), _) => false,
(_, Struct(_)) => false,
- (List(list_from), List(list_to)) => can_cast_types(list_from, list_to),
+ (List(list_from), List(list_to)) => {
+ can_cast_types(list_from.data_type(), list_to.data_type())
+ }
(List(_), _) => false,
- (_, List(list_to)) => can_cast_types(from_type, list_to),
+ (_, List(list_to)) => can_cast_types(from_type, list_to.data_type()),
(Dictionary(_, from_value_type), Dictionary(_, to_value_type)) => {
can_cast_types(from_value_type, to_value_type)
}
@@ -243,9 +245,9 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
(List(_), List(ref to)) => {
let data = array.data_ref();
let underlying_array = make_array(data.child_data()[0].clone());
- let cast_array = cast(&underlying_array, &to)?;
+ let cast_array = cast(&underlying_array, to.data_type())?;
let array_data = ArrayData::new(
- *to.clone(),
+ to.data_type().clone(),
array.len(),
Some(cast_array.null_count()),
cast_array
@@ -266,12 +268,12 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
)),
(_, List(ref to)) => {
// cast primitive to list's primitive
- let cast_array = cast(array, &to)?;
+ let cast_array = cast(array, to.data_type())?;
// create offsets, where if array.len() = 2, we have [0,1,2]
let offsets: Vec<i32> = (0..=array.len() as i32).collect();
let value_offsets = Buffer::from(offsets[..].to_byte_slice());
let list_data = ArrayData::new(
- *to.clone(),
+ to.data_type().clone(),
array.len(),
Some(cast_array.null_count()),
cast_array
@@ -1219,7 +1221,11 @@ mod tests {
fn test_cast_i32_to_list_i32() {
let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
let array = Arc::new(a) as ArrayRef;
- let b = cast(&array, &DataType::List(Box::new(DataType::Int32))).unwrap();
+ let b = cast(
+ &array,
+ &DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
+ )
+ .unwrap();
assert_eq!(5, b.len());
let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
assert_eq!(0, arr.value_offset(0));
@@ -1245,7 +1251,11 @@ mod tests {
fn test_cast_i32_to_list_i32_nullable() {
let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), Some(9)]);
let array = Arc::new(a) as ArrayRef;
- let b = cast(&array, &DataType::List(Box::new(DataType::Int32))).unwrap();
+ let b = cast(
+ &array,
+ &DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
+ )
+ .unwrap();
assert_eq!(5, b.len());
assert_eq!(1, b.null_count());
let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
@@ -1274,7 +1284,11 @@ mod tests {
let a = Int32Array::from(vec![Some(5), None, Some(7), Some(8), None, Some(10)]);
let array = Arc::new(a) as ArrayRef;
let array = array.slice(2, 4);
- let b = cast(&array, &DataType::List(Box::new(DataType::Float64))).unwrap();
+ let b = cast(
+ &array,
+ &DataType::List(Box::new(Field::new("item", DataType::Float64, true))),
+ )
+ .unwrap();
assert_eq!(4, b.len());
assert_eq!(1, b.null_count());
let arr = b.as_any().downcast_ref::<ListArray>().unwrap();
@@ -1348,7 +1362,8 @@ mod tests {
let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
@@ -1356,8 +1371,11 @@ mod tests {
.build();
let list_array = Arc::new(ListArray::from(list_data)) as ArrayRef;
- let cast_array =
- cast(&list_array, &DataType::List(Box::new(DataType::UInt16))).unwrap();
+ let cast_array = cast(
+ &list_array,
+ &DataType::List(Box::new(Field::new("item", DataType::UInt16, true))),
+ )
+ .unwrap();
// 3 negative values should get lost when casting to unsigned,
// 1 value should overflow
assert_eq!(4, cast_array.null_count());
@@ -1403,7 +1421,8 @@ mod tests {
let value_offsets = Buffer::from(&[0, 3, 6, 9].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
@@ -1413,7 +1432,11 @@ mod tests {
cast(
&list_array,
- &DataType::List(Box::new(DataType::Timestamp(TimeUnit::Microsecond, None))),
+ &DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Timestamp(TimeUnit::Microsecond, None),
+ true,
+ ))),
)
.unwrap();
}
@@ -2816,7 +2839,8 @@ mod tests {
let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type.clone())
.len(3)
.add_buffer(value_offsets.clone())
@@ -2837,7 +2861,8 @@ mod tests {
let value_offsets = Buffer::from(&[0i64, 3, 6, 8].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type.clone())
.len(3)
.add_buffer(value_offsets.clone())
@@ -2856,7 +2881,10 @@ mod tests {
.build();
// Construct a fixed size list array from the above two
- let list_data_type = DataType::FixedSizeList(Box::new(DataType::Int32), 2);
+ let list_data_type = DataType::FixedSizeList(
+ Box::new(Field::new("item", DataType::Int32, true)),
+ 2,
+ );
let list_data = ArrayData::builder(list_data_type)
.len(5)
.add_child_data(value_data.clone())
@@ -2949,12 +2977,12 @@ mod tests {
LargeBinary,
Utf8,
LargeUtf8,
- List(Box::new(DataType::Int8)),
- List(Box::new(DataType::Utf8)),
- FixedSizeList(Box::new(DataType::Int8), 10),
- FixedSizeList(Box::new(DataType::Utf8), 10),
- LargeList(Box::new(DataType::Int8)),
- LargeList(Box::new(DataType::Utf8)),
+ List(Box::new(Field::new("item", DataType::Int8, true))),
+ List(Box::new(Field::new("item", DataType::Utf8, true))),
+ FixedSizeList(Box::new(Field::new("item", DataType::Int8, true)), 10),
+ FixedSizeList(Box::new(Field::new("item", DataType::Utf8, false)), 10),
+ LargeList(Box::new(Field::new("item", DataType::Int8, true))),
+ LargeList(Box::new(Field::new("item", DataType::Utf8, false))),
Struct(vec![
Field::new("f1", DataType::Int32, false),
Field::new("f2", DataType::Utf8, true),
diff --git a/rust/arrow/src/compute/kernels/comparison.rs b/rust/arrow/src/compute/kernels/comparison.rs
index 72d0f6b..88bb499 100644
--- a/rust/arrow/src/compute/kernels/comparison.rs
+++ b/rust/arrow/src/compute/kernels/comparison.rs
@@ -681,8 +681,8 @@ fn new_all_set_buffer(len: usize) -> Buffer {
#[cfg(test)]
mod tests {
use super::*;
- use crate::array::Int32Array;
use crate::datatypes::{Int8Type, ToByteSlice};
+ use crate::{array::Int32Array, datatypes::Field};
#[test]
fn test_primitive_array_eq() {
@@ -950,7 +950,8 @@ mod tests {
])
.data();
let value_offsets = Buffer::from(&[0i64, 3, 6, 6, 9].to_byte_slice());
- let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type)
.len(4)
.add_buffer(value_offsets)
diff --git a/rust/arrow/src/compute/kernels/concat.rs b/rust/arrow/src/compute/kernels/concat.rs
index 4dc9456..d07d35e 100644
--- a/rust/arrow/src/compute/kernels/concat.rs
+++ b/rust/arrow/src/compute/kernels/concat.rs
@@ -114,7 +114,9 @@ pub fn concat(array_list: &[ArrayRef]) -> Result<ArrayRef> {
DataType::Duration(TimeUnit::Nanosecond) => {
concat_primitive::<DurationNanosecondType>(array_data_list)
}
- DataType::List(nested_type) => concat_list(array_data_list, *nested_type.clone()),
+ DataType::List(nested_field) => {
+ concat_list(array_data_list, nested_field.data_type())
+ }
t => Err(ArrowError::ComputeError(format!(
"Concat not supported for data type {:?}",
t
@@ -145,7 +147,7 @@ where
#[inline]
fn concat_list(
array_data_list: &[ArrayDataRef],
- data_type: DataType,
+ data_type: &DataType,
) -> Result<ArrayRef> {
match data_type {
DataType::Int8 => concat_primitive_list::<Int8Type>(array_data_list),
diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs
index 2e6ed12..05d5348 100644
--- a/rust/arrow/src/compute/kernels/filter.rs
+++ b/rust/arrow/src/compute/kernels/filter.rs
@@ -488,7 +488,7 @@ impl FilterContext {
key_type, value_type
)))
}
- DataType::List(dt) => match &**dt {
+ DataType::List(dt) => match dt.data_type() {
DataType::UInt8 => {
filter_primitive_item_list_array!(self, array, UInt8Type, ListArray, ListBuilder)
}
@@ -601,7 +601,7 @@ impl FilterContext {
)))
}
}
- DataType::LargeList(dt) => match &**dt {
+ DataType::LargeList(dt) => match dt.data_type() {
DataType::UInt8 => {
filter_primitive_item_list_array!(self, array, UInt8Type, LargeListArray, LargeListBuilder)
}
@@ -1085,7 +1085,8 @@ mod tests {
let value_offsets = Buffer::from(&[0i64, 3, 6, 8, 8].to_byte_slice());
- let list_data_type = DataType::LargeList(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(4)
.add_buffer(value_offsets)
diff --git a/rust/arrow/src/compute/kernels/limit.rs b/rust/arrow/src/compute/kernels/limit.rs
index 44b6c1c..65f66bc 100644
--- a/rust/arrow/src/compute/kernels/limit.rs
+++ b/rust/arrow/src/compute/kernels/limit.rs
@@ -110,7 +110,8 @@ mod tests {
bit_util::set_bit(&mut null_bits, 8);
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type)
.len(9)
.add_buffer(value_offsets)
diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs
index 9cb7f36..83139c3 100644
--- a/rust/arrow/src/compute/kernels/take.rs
+++ b/rust/arrow/src/compute/kernels/take.rs
@@ -778,7 +778,11 @@ mod tests {
let value_offsets: [$offset_type; 4] = [0, 3, 6, 8];
let value_offsets = Buffer::from(&value_offsets.to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::$list_data_type(Box::new(DataType::Int32));
+ let list_data_type = DataType::$list_data_type(Box::new(Field::new(
+ "item",
+ DataType::Int32,
+ false,
+ )));
let list_data = ArrayData::builder(list_data_type.clone())
.len(3)
.add_buffer(value_offsets)
@@ -847,7 +851,11 @@ mod tests {
let value_offsets: [$offset_type; 5] = [0, 3, 6, 7, 9];
let value_offsets = Buffer::from(&value_offsets.to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::$list_data_type(Box::new(DataType::Int32));
+ let list_data_type = DataType::$list_data_type(Box::new(Field::new(
+ "item",
+ DataType::Int32,
+ false,
+ )));
let list_data = ArrayData::builder(list_data_type.clone())
.len(4)
.add_buffer(value_offsets)
@@ -916,7 +924,11 @@ mod tests {
let value_offsets: [$offset_type; 5] = [0, 3, 6, 6, 8];
let value_offsets = Buffer::from(&value_offsets.to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::$list_data_type(Box::new(DataType::Int32));
+ let list_data_type = DataType::$list_data_type(Box::new(Field::new(
+ "item",
+ DataType::Int32,
+ false,
+ )));
let list_data = ArrayData::builder(list_data_type.clone())
.len(4)
.add_buffer(value_offsets)
@@ -1006,7 +1018,8 @@ mod tests {
// Construct offsets
let value_offsets = Buffer::from(&[0, 3, 6, 8].to_byte_slice());
// Construct a list array from the above two
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, false)));
let list_data = ArrayData::builder(list_data_type.clone())
.len(3)
.add_buffer(value_offsets)
diff --git a/rust/arrow/src/compute/util.rs b/rust/arrow/src/compute/util.rs
index 686cc8b..3b57801 100644
--- a/rust/arrow/src/compute/util.rs
+++ b/rust/arrow/src/compute/util.rs
@@ -321,7 +321,7 @@ mod tests {
#[test]
fn test_take_value_index_from_list() {
let list = build_list(
- DataType::List(Box::new(DataType::Int32)),
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
vec![0i32, 2i32, 5i32, 10i32],
);
@@ -337,7 +337,7 @@ mod tests {
#[test]
fn test_take_value_index_from_large_list() {
let list = build_list(
- DataType::LargeList(Box::new(DataType::Int32)),
+ DataType::LargeList(Box::new(Field::new("item", DataType::Int32, false))),
Int32Array::from(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
vec![0i64, 2i64, 5i64, 10i64],
);
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index 6f0dc24..099355e 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -125,11 +125,11 @@ pub enum DataType {
/// A variable-length string in Unicode with UFT-8 encoding and 64-bit offsets.
LargeUtf8,
/// A list of some logical data type with variable length.
- List(Box<DataType>),
+ List(Box<Field>),
/// A list of some logical data type with fixed length.
- FixedSizeList(Box<DataType>, i32),
+ FixedSizeList(Box<Field>, i32),
/// A list of some logical data type with variable length and 64-bit offsets.
- LargeList(Box<DataType>),
+ LargeList(Box<Field>),
/// A nested datatype that contains a number of sub-fields.
Struct(Vec<Field>),
/// A nested datatype that can represent slots of differing types.
@@ -189,8 +189,8 @@ pub struct Field {
name: String,
data_type: DataType,
nullable: bool,
- pub(crate) dict_id: i64,
- pub(crate) dict_is_ordered: bool,
+ dict_id: i64,
+ dict_is_ordered: bool,
}
pub trait ArrowNativeType:
@@ -874,6 +874,7 @@ impl<T: ArrowNativeType> ToByteSlice for T {
impl DataType {
/// Parse a data type from a JSON representation
fn from(json: &Value) -> Result<DataType> {
+ let default_field = Field::new("", DataType::Boolean, true);
match *json {
Value::Object(ref map) => match map.get("name") {
Some(s) if s == "null" => Ok(DataType::Null),
@@ -1007,17 +1008,17 @@ impl DataType {
},
Some(s) if s == "list" => {
// return a list with any type as its child isn't defined in the map
- Ok(DataType::List(Box::new(DataType::Boolean)))
+ Ok(DataType::List(Box::new(default_field)))
}
Some(s) if s == "largelist" => {
// return a largelist with any type as its child isn't defined in the map
- Ok(DataType::LargeList(Box::new(DataType::Boolean)))
+ Ok(DataType::LargeList(Box::new(default_field)))
}
Some(s) if s == "fixedsizelist" => {
// return a list with any type as its child isn't defined in the map
if let Some(Value::Number(size)) = map.get("listSize") {
Ok(DataType::FixedSizeList(
- Box::new(DataType::Boolean),
+ Box::new(default_field),
size.as_i64().unwrap() as i32,
))
} else {
@@ -1182,6 +1183,18 @@ impl Field {
self.nullable
}
+ /// Returns the dictionary ID
+ #[inline]
+ pub const fn dict_id(&self) -> i64 {
+ self.dict_id
+ }
+
+ /// Indicates whether this `Field`'s dictionary is ordered
+ #[inline]
+ pub const fn dict_is_ordered(&self) -> bool {
+ self.dict_is_ordered
+ }
+
/// Parse a `Field` definition from a JSON representation
pub fn from(json: &Value) -> Result<Self> {
match *json {
@@ -1223,14 +1236,14 @@ impl Field {
}
match data_type {
DataType::List(_) => DataType::List(Box::new(
- Self::from(&values[0])?.data_type,
+ Self::from(&values[0])?,
)),
DataType::LargeList(_) => DataType::LargeList(Box::new(
- Self::from(&values[0])?.data_type,
+ Self::from(&values[0])?,
)),
DataType::FixedSizeList(_, int) => {
DataType::FixedSizeList(
- Box::new(Self::from(&values[0])?.data_type),
+ Box::new(Self::from(&values[0])?),
int,
)
}
@@ -1322,18 +1335,9 @@ impl Field {
pub fn to_json(&self) -> Value {
let children: Vec<Value> = match self.data_type() {
DataType::Struct(fields) => fields.iter().map(|f| f.to_json()).collect(),
- DataType::List(dtype) => {
- let item = Field::new("item", *dtype.clone(), self.nullable);
- vec![item.to_json()]
- }
- DataType::LargeList(dtype) => {
- let item = Field::new("item", *dtype.clone(), self.nullable);
- vec![item.to_json()]
- }
- DataType::FixedSizeList(dtype, _) => {
- let item = Field::new("item", *dtype.clone(), self.nullable);
- vec![item.to_json()]
- }
+ DataType::List(field) => vec![field.to_json()],
+ DataType::LargeList(field) => vec![field.to_json()],
+ DataType::FixedSizeList(field, _) => vec![field.to_json()],
_ => vec![],
};
match self.data_type() {
@@ -1958,17 +1962,30 @@ mod tests {
),
Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
- Field::new("c21", DataType::List(Box::new(DataType::Boolean)), false),
+ Field::new(
+ "c21",
+ DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
+ false,
+ ),
Field::new(
"c22",
- DataType::FixedSizeList(Box::new(DataType::Boolean), 5),
+ DataType::FixedSizeList(
+ Box::new(Field::new("bools", DataType::Boolean, false)),
+ 5,
+ ),
false,
),
Field::new(
"c23",
- DataType::List(Box::new(DataType::List(Box::new(DataType::Struct(
- vec![],
- ))))),
+ DataType::List(Box::new(Field::new(
+ "inner_list",
+ DataType::List(Box::new(Field::new(
+ "struct",
+ DataType::Struct(vec![]),
+ true,
+ ))),
+ false,
+ ))),
true,
),
Field::new(
@@ -1999,9 +2016,15 @@ mod tests {
Field::new("c33", DataType::LargeUtf8, true),
Field::new(
"c34",
- DataType::LargeList(Box::new(DataType::LargeList(Box::new(
- DataType::Struct(vec![]),
- )))),
+ DataType::LargeList(Box::new(Field::new(
+ "inner_large_list",
+ DataType::LargeList(Box::new(Field::new(
+ "struct",
+ DataType::Struct(vec![]),
+ false,
+ ))),
+ true,
+ ))),
true,
),
],
@@ -2207,7 +2230,7 @@ mod tests {
"children": [
{
"name": "item",
- "nullable": false,
+ "nullable": true,
"type": {
"name": "bool"
},
@@ -2224,7 +2247,7 @@ mod tests {
},
"children": [
{
- "name": "item",
+ "name": "bools",
"nullable": false,
"type": {
"name": "bool"
@@ -2241,14 +2264,14 @@ mod tests {
},
"children": [
{
- "name": "item",
- "nullable": true,
+ "name": "inner_list",
+ "nullable": false,
"type": {
"name": "list"
},
"children": [
{
- "name": "item",
+ "name": "struct",
"nullable": true,
"type": {
"name": "struct"
@@ -2381,15 +2404,15 @@ mod tests {
},
"children": [
{
- "name": "item",
+ "name": "inner_large_list",
"nullable": true,
"type": {
"name": "largelist"
},
"children": [
{
- "name": "item",
- "nullable": true,
+ "name": "struct",
+ "nullable": false,
"type": {
"name": "struct"
},
diff --git a/rust/arrow/src/ipc/convert.rs b/rust/arrow/src/ipc/convert.rs
index 63d55f0..feb4133 100644
--- a/rust/arrow/src/ipc/convert.rs
+++ b/rust/arrow/src/ipc/convert.rs
@@ -268,30 +268,22 @@ pub(crate) fn get_data_type(field: ipc::Field, may_be_dictionary: bool) -> DataT
if children.len() != 1 {
panic!("expect a list to have one child")
}
- let child_field = children.get(0);
- // returning int16 for now, to test, not sure how to get data type
- DataType::List(Box::new(get_data_type(child_field, false)))
+ DataType::List(Box::new(children.get(0).into()))
}
ipc::Type::LargeList => {
let children = field.children().unwrap();
if children.len() != 1 {
panic!("expect a large list to have one child")
}
- let child_field = children.get(0);
- // returning int16 for now, to test, not sure how to get data type
- DataType::LargeList(Box::new(get_data_type(child_field, false)))
+ DataType::LargeList(Box::new(children.get(0).into()))
}
ipc::Type::FixedSizeList => {
let children = field.children().unwrap();
if children.len() != 1 {
panic!("expect a list to have one child")
}
- let child_field = children.get(0);
let fsl = field.type_as_fixed_size_list().unwrap();
- DataType::FixedSizeList(
- Box::new(get_data_type(child_field, false)),
- fsl.listSize(),
- )
+ DataType::FixedSizeList(Box::new(children.get(0).into()), fsl.listSize())
}
ipc::Type::Struct_ => {
let mut fields = vec![];
@@ -324,8 +316,8 @@ pub(crate) fn build_field<'a: 'b, 'b>(
let fb_dictionary = if let Dictionary(index_type, _) = field.data_type() {
Some(get_fb_dictionary(
index_type,
- field.dict_id,
- field.dict_is_ordered,
+ field.dict_id(),
+ field.dict_is_ordered(),
fbb,
))
} else {
@@ -537,19 +529,7 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
}
}
List(ref list_type) => {
- let inner_types = get_fb_field_type(list_type, fbb);
- let child = ipc::Field::create(
- fbb,
- &ipc::FieldArgs {
- name: None,
- nullable: false,
- type_type: inner_types.type_type,
- type_: Some(inner_types.type_),
- children: inner_types.children,
- dictionary: None,
- custom_metadata: None,
- },
- );
+ let child = build_field(fbb, list_type);
FBFieldType {
type_type: ipc::Type::List,
type_: ipc::ListBuilder::new(fbb).finish().as_union_value(),
@@ -557,19 +537,7 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
}
}
LargeList(ref list_type) => {
- let inner_types = get_fb_field_type(list_type, fbb);
- let child = ipc::Field::create(
- fbb,
- &ipc::FieldArgs {
- name: None,
- nullable: false,
- type_type: inner_types.type_type,
- type_: Some(inner_types.type_),
- dictionary: None,
- children: inner_types.children,
- custom_metadata: None,
- },
- );
+ let child = build_field(fbb, list_type);
FBFieldType {
type_type: ipc::Type::LargeList,
type_: ipc::LargeListBuilder::new(fbb).finish().as_union_value(),
@@ -577,19 +545,7 @@ pub(crate) fn get_fb_field_type<'a: 'b, 'b>(
}
}
FixedSizeList(ref list_type, len) => {
- let inner_types = get_fb_field_type(list_type, fbb);
- let child = ipc::Field::create(
- fbb,
- &ipc::FieldArgs {
- name: None,
- nullable: false,
- type_type: inner_types.type_type,
- type_: Some(inner_types.type_),
- dictionary: None,
- children: inner_types.children,
- custom_metadata: None,
- },
- );
+ let child = build_field(fbb, list_type);
let mut builder = ipc::FixedSizeListBuilder::new(fbb);
builder.add_listSize(*len as i32);
FBFieldType {
@@ -735,14 +691,22 @@ mod tests {
),
Field::new("utf8", DataType::Utf8, false),
Field::new("binary", DataType::Binary, false),
- Field::new("list[u8]", DataType::List(Box::new(DataType::UInt8)), true),
+ Field::new(
+ "list[u8]",
+ DataType::List(Box::new(Field::new("item", DataType::UInt8, false))),
+ true,
+ ),
Field::new(
"list[struct<float32, int32, bool>]",
- DataType::List(Box::new(DataType::Struct(vec![
- Field::new("float32", DataType::UInt8, false),
- Field::new("int32", DataType::Int32, true),
- Field::new("bool", DataType::Boolean, true),
- ]))),
+ DataType::List(Box::new(Field::new(
+ "struct",
+ DataType::Struct(vec![
+ Field::new("float32", DataType::UInt8, false),
+ Field::new("int32", DataType::Int32, true),
+ Field::new("bool", DataType::Boolean, true),
+ ]),
+ true,
+ ))),
false,
),
Field::new(
@@ -751,18 +715,26 @@ mod tests {
Field::new("int64", DataType::Int64, true),
Field::new(
"list[struct<date32, list[struct<>]>]",
- DataType::List(Box::new(DataType::Struct(vec![
- Field::new(
- "date32",
- DataType::Date32(DateUnit::Day),
- true,
- ),
- Field::new(
- "list[struct<>]",
- DataType::List(Box::new(DataType::Struct(vec![]))),
- false,
- ),
- ]))),
+ DataType::List(Box::new(Field::new(
+ "struct",
+ DataType::Struct(vec![
+ Field::new(
+ "date32",
+ DataType::Date32(DateUnit::Day),
+ true,
+ ),
+ Field::new(
+ "list[struct<>]",
+ DataType::List(Box::new(Field::new(
+ "struct",
+ DataType::Struct(vec![]),
+ false,
+ ))),
+ false,
+ ),
+ ]),
+ false,
+ ))),
false,
),
]),
diff --git a/rust/arrow/src/ipc/reader.rs b/rust/arrow/src/ipc/reader.rs
index e4bb003..05cf00b 100644
--- a/rust/arrow/src/ipc/reader.rs
+++ b/rust/arrow/src/ipc/reader.rs
@@ -89,7 +89,7 @@ fn create_array(
buffer_index += 2;
array
}
- List(ref list_data_type) | LargeList(ref list_data_type) => {
+ List(ref list_field) | LargeList(ref list_field) => {
let list_node = &nodes[node_index];
let list_buffers: Vec<Buffer> = buffers[buffer_index..buffer_index + 2]
.iter()
@@ -99,7 +99,7 @@ fn create_array(
buffer_index += 2;
let triple = create_array(
nodes,
- list_data_type,
+ list_field.data_type(),
data,
buffers,
dictionaries,
@@ -111,7 +111,7 @@ fn create_array(
create_list_array(list_node, data_type, &list_buffers[..], triple.0)
}
- FixedSizeList(ref list_data_type, _) => {
+ FixedSizeList(ref list_field, _) => {
let list_node = &nodes[node_index];
let list_buffers: Vec<Buffer> = buffers[buffer_index..=buffer_index]
.iter()
@@ -121,7 +121,7 @@ fn create_array(
buffer_index += 1;
let triple = create_array(
nodes,
- list_data_type,
+ list_field.data_type(),
data,
buffers,
dictionaries,
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index 42ae350..8959458 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -65,54 +65,86 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result<DataType> {
1 => Ok(dt[0].clone()),
2 => {
// there can be a case where a list and scalar both exist
- if dt.contains(&&DataType::List(Box::new(DataType::Float64)))
- || dt.contains(&&DataType::List(Box::new(DataType::Int64)))
- || dt.contains(&&DataType::List(Box::new(DataType::Boolean)))
- || dt.contains(&&DataType::List(Box::new(DataType::Utf8)))
- {
+ if dt.contains(&&DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Float64,
+ true,
+ )))) || dt.contains(&&DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Int64,
+ true,
+ )))) || dt.contains(&&DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Boolean,
+ true,
+ )))) || dt.contains(&&DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Utf8,
+ true,
+ )))) {
// we have a list and scalars, so we should get the values and coerce them
let mut dt = dt;
// sorting guarantees that the list will be the second value
dt.sort();
match (dt[0], dt[1]) {
- (t1, DataType::List(e)) if **e == DataType::Float64 => {
+ (t1, DataType::List(e)) if e.data_type() == &DataType::Float64 => {
if t1 == &DataType::Float64 {
- Ok(DataType::List(Box::new(DataType::Float64)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Float64,
+ true,
+ ))))
} else {
- Ok(DataType::List(Box::new(coerce_data_type(vec![
- t1,
- &DataType::Float64,
- ])?)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ coerce_data_type(vec![t1, &DataType::Float64])?,
+ true,
+ ))))
}
}
- (t1, DataType::List(e)) if **e == DataType::Int64 => {
+ (t1, DataType::List(e)) if e.data_type() == &DataType::Int64 => {
if t1 == &DataType::Int64 {
- Ok(DataType::List(Box::new(DataType::Int64)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Int64,
+ true,
+ ))))
} else {
- Ok(DataType::List(Box::new(coerce_data_type(vec![
- t1,
- &DataType::Int64,
- ])?)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ coerce_data_type(vec![t1, &DataType::Int64])?,
+ true,
+ ))))
}
}
- (t1, DataType::List(e)) if **e == DataType::Boolean => {
+ (t1, DataType::List(e)) if e.data_type() == &DataType::Boolean => {
if t1 == &DataType::Boolean {
- Ok(DataType::List(Box::new(DataType::Boolean)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Boolean,
+ true,
+ ))))
} else {
- Ok(DataType::List(Box::new(coerce_data_type(vec![
- t1,
- &DataType::Boolean,
- ])?)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ coerce_data_type(vec![t1, &DataType::Boolean])?,
+ true,
+ ))))
}
}
- (t1, DataType::List(e)) if **e == DataType::Utf8 => {
+ (t1, DataType::List(e)) if e.data_type() == &DataType::Utf8 => {
if t1 == &DataType::Utf8 {
- Ok(DataType::List(Box::new(DataType::Utf8)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Utf8,
+ true,
+ ))))
} else {
- Ok(DataType::List(Box::new(coerce_data_type(vec![
- t1,
- &DataType::Utf8,
- ])?)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ coerce_data_type(vec![t1, &DataType::Utf8])?,
+ true,
+ ))))
}
}
(t1, t2) => Err(ArrowError::JsonError(format!(
@@ -129,7 +161,11 @@ fn coerce_data_type(dt: Vec<&DataType>) -> Result<DataType> {
_ => {
// TODO(nevi_me) It's possible to have [float, int, list(float)], which should
// return list(float). Will hash this out later
- Ok(DataType::List(Box::new(DataType::Utf8)))
+ Ok(DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Utf8,
+ true,
+ ))))
}
}
}
@@ -268,11 +304,15 @@ pub fn infer_json_schema<R: Read>(
if values.contains_key(k) {
let x = values.get_mut(k).unwrap();
- x.insert(DataType::List(Box::new(dt)));
+ x.insert(DataType::List(Box::new(
+ Field::new("item", dt, true),
+ )));
} else {
// create hashset and add value type
let mut hs = HashSet::new();
- hs.insert(DataType::List(Box::new(dt)));
+ hs.insert(DataType::List(Box::new(
+ Field::new("item", dt, true),
+ )));
values.insert(k.to_string(), hs);
}
}
@@ -563,7 +603,7 @@ impl<R: Read> Reader<R> {
Ok(Arc::new(builder.finish()) as ArrayRef)
}
DataType::List(ref t) => {
- match **t {
+ match t.data_type() {
DataType::Int8 => {
self.build_list_array::<Int8Type>(rows, field.name())
}
@@ -1352,12 +1392,12 @@ mod tests {
assert_eq!(&DataType::Int64, a.1.data_type());
let b = schema.column_with_name("b").unwrap();
assert_eq!(
- &DataType::List(Box::new(DataType::Float64)),
+ &DataType::List(Box::new(Field::new("item", DataType::Float64, true))),
b.1.data_type()
);
let c = schema.column_with_name("c").unwrap();
assert_eq!(
- &DataType::List(Box::new(DataType::Boolean)),
+ &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
c.1.data_type()
);
let d = schema.column_with_name("d").unwrap();
@@ -1410,21 +1450,37 @@ mod tests {
use crate::datatypes::DataType::*;
assert_eq!(
- List(Box::new(Float64)),
- coerce_data_type(vec![&Float64, &List(Box::new(Float64))]).unwrap()
+ List(Box::new(Field::new("item", Float64, true))),
+ coerce_data_type(vec![
+ &Float64,
+ &List(Box::new(Field::new("item", Float64, true)))
+ ])
+ .unwrap()
);
assert_eq!(
- List(Box::new(Float64)),
- coerce_data_type(vec![&Float64, &List(Box::new(Int64))]).unwrap()
+ List(Box::new(Field::new("item", Float64, true))),
+ coerce_data_type(vec![
+ &Float64,
+ &List(Box::new(Field::new("item", Int64, true)))
+ ])
+ .unwrap()
);
assert_eq!(
- List(Box::new(Int64)),
- coerce_data_type(vec![&Int64, &List(Box::new(Int64))]).unwrap()
+ List(Box::new(Field::new("item", Int64, true))),
+ coerce_data_type(vec![
+ &Int64,
+ &List(Box::new(Field::new("item", Int64, true)))
+ ])
+ .unwrap()
);
// boolean and number are incompatible, return utf8
assert_eq!(
- List(Box::new(Utf8)),
- coerce_data_type(vec![&Boolean, &List(Box::new(Float64))]).unwrap()
+ List(Box::new(Field::new("item", Utf8, true))),
+ coerce_data_type(vec![
+ &Boolean,
+ &List(Box::new(Field::new("item", Float64, true)))
+ ])
+ .unwrap()
);
}
@@ -1455,16 +1511,19 @@ mod tests {
assert_eq!(&DataType::Int64, a.1.data_type());
let b = schema.column_with_name("b").unwrap();
assert_eq!(
- &DataType::List(Box::new(DataType::Float64)),
+ &DataType::List(Box::new(Field::new("item", DataType::Float64, true))),
b.1.data_type()
);
let c = schema.column_with_name("c").unwrap();
assert_eq!(
- &DataType::List(Box::new(DataType::Boolean)),
+ &DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
c.1.data_type()
);
let d = schema.column_with_name("d").unwrap();
- assert_eq!(&DataType::List(Box::new(DataType::Utf8)), d.1.data_type());
+ assert_eq!(
+ &DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
+ d.1.data_type()
+ );
let bb = batch
.column(b.0)
@@ -1655,9 +1714,10 @@ mod tests {
fn test_list_of_string_dictionary_from_json() {
let schema = Schema::new(vec![Field::new(
"events",
- List(Box::new(Dictionary(
- Box::new(DataType::UInt64),
- Box::new(DataType::Utf8),
+ List(Box::new(Field::new(
+ "item",
+ Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)),
+ true,
))),
true,
)]);
@@ -1678,9 +1738,10 @@ mod tests {
let events = schema.column_with_name("events").unwrap();
assert_eq!(
- &List(Box::new(Dictionary(
- Box::new(DataType::UInt64),
- Box::new(DataType::Utf8)
+ &List(Box::new(Field::new(
+ "item",
+ Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)),
+ true
))),
events.1.data_type()
);
@@ -1712,9 +1773,10 @@ mod tests {
fn test_list_of_string_dictionary_from_json_with_nulls() {
let schema = Schema::new(vec![Field::new(
"events",
- List(Box::new(Dictionary(
- Box::new(DataType::UInt64),
- Box::new(DataType::Utf8),
+ List(Box::new(Field::new(
+ "item",
+ Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)),
+ true,
))),
true,
)]);
@@ -1737,9 +1799,10 @@ mod tests {
let events = schema.column_with_name("events").unwrap();
assert_eq!(
- &List(Box::new(Dictionary(
- Box::new(DataType::UInt64),
- Box::new(DataType::Utf8)
+ &List(Box::new(Field::new(
+ "item",
+ Dictionary(Box::new(DataType::UInt64), Box::new(DataType::Utf8)),
+ true
))),
events.1.data_type()
);
@@ -1876,9 +1939,21 @@ mod tests {
fn test_json_infer_schema() {
let schema = Schema::new(vec![
Field::new("a", DataType::Int64, true),
- Field::new("b", DataType::List(Box::new(DataType::Float64)), true),
- Field::new("c", DataType::List(Box::new(DataType::Boolean)), true),
- Field::new("d", DataType::List(Box::new(DataType::Utf8)), true),
+ Field::new(
+ "b",
+ DataType::List(Box::new(Field::new("item", DataType::Float64, true))),
+ true,
+ ),
+ Field::new(
+ "c",
+ DataType::List(Box::new(Field::new("item", DataType::Boolean, true))),
+ true,
+ ),
+ Field::new(
+ "d",
+ DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
+ true,
+ ),
]);
let mut reader =
diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs
index 484c9ff..b4aa97d 100644
--- a/rust/arrow/src/record_batch.rs
+++ b/rust/arrow/src/record_batch.rs
@@ -99,6 +99,7 @@ impl RecordBatch {
"all columns in a record batch must have the same length".to_string(),
));
}
+ // list types can have different names, but we only need the data types to be the same
if column.data_type() != schema.field(i).data_type() {
return Err(ArrowError::InvalidArgumentError(format!(
"column types must match schema types, expected {:?} but found {:?} at column index {}",
diff --git a/rust/arrow/src/util/integration_util.rs b/rust/arrow/src/util/integration_util.rs
index c1bba13..22d2716 100644
--- a/rust/arrow/src/util/integration_util.rs
+++ b/rust/arrow/src/util/integration_util.rs
@@ -375,9 +375,9 @@ impl ArrowJsonBatch {
/// Convert an Arrow JSON column/array into a vector of `Value`
fn json_from_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec<Value> {
match data_type {
- DataType::List(dt) => json_from_list_col(col, &**dt),
- DataType::FixedSizeList(dt, list_size) => {
- json_from_fixed_size_list_col(col, &**dt, *list_size as usize)
+ DataType::List(field) => json_from_list_col(col, field.data_type()),
+ DataType::FixedSizeList(field, list_size) => {
+ json_from_fixed_size_list_col(col, field.data_type(), *list_size as usize)
}
DataType::Struct(fields) => json_from_struct_col(col, fields),
DataType::Int64
@@ -474,7 +474,7 @@ fn json_from_list_col(col: &ArrowJsonColumn, data_type: &DataType) -> Vec<Value>
})
.collect();
let inner = match data_type {
- DataType::List(ref dt) => json_from_col(child, &**dt),
+ DataType::List(ref field) => json_from_col(child, field.data_type()),
DataType::Struct(fields) => json_from_struct_col(col, fields),
_ => merge_json_array(
child.validity.as_ref().unwrap().as_slice(),
@@ -511,8 +511,8 @@ fn json_from_fixed_size_list_col(
// get the inner array
let child = &col.children.clone().expect("list type must have children")[0];
let inner = match data_type {
- DataType::List(ref dt) => json_from_col(child, &**dt),
- DataType::FixedSizeList(ref dt, _) => json_from_col(child, &**dt),
+ DataType::List(ref field) => json_from_col(child, field.data_type()),
+ DataType::FixedSizeList(ref field, _) => json_from_col(child, field.data_type()),
DataType::Struct(fields) => json_from_struct_col(col, fields),
_ => merge_json_array(
child.validity.as_ref().unwrap().as_slice(),
@@ -577,13 +577,13 @@ mod tests {
"nullable": true,
"children": [
{
- "name": "item",
+ "name": "custom_item",
"type": {
"name": "int",
"isSigned": true,
"bitWidth": 32
},
- "nullable": true,
+ "nullable": false,
"children": []
}
]
@@ -595,7 +595,15 @@ mod tests {
Field::new("c1", DataType::Int32, true),
Field::new("c2", DataType::Float64, true),
Field::new("c3", DataType::Utf8, true),
- Field::new("c4", DataType::List(Box::new(DataType::Int32)), true),
+ Field::new(
+ "c4",
+ DataType::List(Box::new(Field::new(
+ "custom_item",
+ DataType::Int32,
+ false,
+ ))),
+ true,
+ ),
]);
assert!(json_schema.equals_schema(&schema));
}
@@ -661,7 +669,11 @@ mod tests {
true,
),
Field::new("utf8s", DataType::Utf8, true),
- Field::new("lists", DataType::List(Box::new(DataType::Int32)), true),
+ Field::new(
+ "lists",
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
+ true,
+ ),
Field::new(
"structs",
DataType::Struct(vec![
@@ -735,7 +747,8 @@ mod tests {
let value_data = Int32Array::from(vec![None, Some(2), None, None]);
let value_offsets = Buffer::from(&[0, 3, 4, 4].to_byte_slice());
- let list_data_type = DataType::List(Box::new(DataType::Int32));
+ let list_data_type =
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true)));
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_buffer(value_offsets)
diff --git a/rust/arrow/test/data/integration.json b/rust/arrow/test/data/integration.json
index d542779..193636f 100644
--- a/rust/arrow/test/data/integration.json
+++ b/rust/arrow/test/data/integration.json
@@ -251,19 +251,19 @@
},
{
"name": "lists",
+ "nullable": true,
"type": {
"name": "list"
},
- "nullable": true,
"children": [
{
"name": "item",
+ "nullable": true,
"type": {
"name": "int",
- "isSigned": true,
- "bitWidth": 32
+ "bitWidth": 32,
+ "isSigned": true
},
- "nullable": true,
"children": []
}
]
diff --git a/rust/datafusion/src/physical_plan/distinct_expressions.rs b/rust/datafusion/src/physical_plan/distinct_expressions.rs
index c2183ca..ed90cb3 100644
--- a/rust/datafusion/src/physical_plan/distinct_expressions.rs
+++ b/rust/datafusion/src/physical_plan/distinct_expressions.rs
@@ -80,7 +80,7 @@ impl AggregateExpr for DistinctCount {
.map(|data_type| {
Field::new(
&format_state_name(&self.name, "count distinct"),
- DataType::List(Box::new(data_type.clone())),
+ DataType::List(Box::new(Field::new("item", data_type.clone(), true))),
false,
)
})
diff --git a/rust/datafusion/src/physical_plan/functions.rs b/rust/datafusion/src/physical_plan/functions.rs
index afc4690..d0a1211 100644
--- a/rust/datafusion/src/physical_plan/functions.rs
+++ b/rust/datafusion/src/physical_plan/functions.rs
@@ -42,7 +42,7 @@ use arrow::{
array::ArrayRef,
compute::kernels::length::length,
datatypes::TimeUnit,
- datatypes::{DataType, Schema},
+ datatypes::{DataType, Field, Schema},
record_batch::RecordBatch,
};
use fmt::{Debug, Formatter};
@@ -203,7 +203,7 @@ pub fn return_type(
Ok(DataType::Timestamp(TimeUnit::Nanosecond, None))
}
BuiltinScalarFunction::Array => Ok(DataType::FixedSizeList(
- Box::new(arg_types[0].clone()),
+ Box::new(Field::new("item", arg_types[0].clone(), true)),
arg_types.len() as i32,
)),
_ => Ok(DataType::Float64),
@@ -471,7 +471,7 @@ mod tests {
assert_eq!(
expr.data_type(&schema)?,
// type equals to a common coercion
- DataType::FixedSizeList(Box::new(expected_type), 2)
+ DataType::FixedSizeList(Box::new(Field::new("item", expected_type, true)), 2)
);
// evaluate works
diff --git a/rust/datafusion/src/scalar.rs b/rust/datafusion/src/scalar.rs
index 83701e5..2eb1d69 100644
--- a/rust/datafusion/src/scalar.rs
+++ b/rust/datafusion/src/scalar.rs
@@ -28,7 +28,10 @@ use arrow::array::{
Int16Builder, Int32Builder, Int64Builder, Int8Builder, ListBuilder, UInt16Builder,
UInt32Builder, UInt64Builder, UInt8Builder,
};
-use arrow::{array::ArrayRef, datatypes::DataType};
+use arrow::{
+ array::ArrayRef,
+ datatypes::{DataType, Field},
+};
use crate::error::{DataFusionError, Result};
@@ -124,7 +127,7 @@ impl ScalarValue {
ScalarValue::Utf8(_) => DataType::Utf8,
ScalarValue::LargeUtf8(_) => DataType::LargeUtf8,
ScalarValue::List(_, data_type) => {
- DataType::List(Box::new(data_type.clone()))
+ DataType::List(Box::new(Field::new("item", data_type.clone(), true)))
}
}
}
@@ -212,7 +215,7 @@ impl ScalarValue {
Some(scalar_vec)
}
};
- ScalarValue::List(value, *nested_type.clone())
+ ScalarValue::List(value, nested_type.data_type().clone())
}
other => {
return Err(DataFusionError::NotImplemented(format!(
@@ -309,7 +312,7 @@ impl TryFrom<&DataType> for ScalarValue {
&DataType::Utf8 => ScalarValue::Utf8(None),
&DataType::LargeUtf8 => ScalarValue::LargeUtf8(None),
&DataType::List(ref nested_type) => {
- ScalarValue::List(None, *nested_type.clone())
+ ScalarValue::List(None, nested_type.data_type().clone())
}
_ => {
return Err(DataFusionError::NotImplemented(format!(
diff --git a/rust/datafusion/tests/sql.rs b/rust/datafusion/tests/sql.rs
index a35476d..8dd7afa 100644
--- a/rust/datafusion/tests/sql.rs
+++ b/rust/datafusion/tests/sql.rs
@@ -142,10 +142,14 @@ async fn parquet_list_columns() {
let schema = Arc::new(Schema::new(vec![
Field::new(
"int64_list",
- DataType::List(Box::new(DataType::Int64)),
+ DataType::List(Box::new(Field::new("item", DataType::Int64, true))),
+ true,
+ ),
+ Field::new(
+ "utf8_list",
+ DataType::List(Box::new(Field::new("item", DataType::Utf8, true))),
true,
),
- Field::new("utf8_list", DataType::List(Box::new(DataType::Utf8)), true),
]));
let sql = "SELECT int64_list, utf8_list FROM list_columns";
diff --git a/rust/parquet/src/arrow/array_reader.rs b/rust/parquet/src/arrow/array_reader.rs
index a0026f3..a455e48 100644
--- a/rust/parquet/src/arrow/array_reader.rs
+++ b/rust/parquet/src/arrow/array_reader.rs
@@ -1348,7 +1348,11 @@ impl<'a> TypeVisitor<Option<Box<dyn ArrayReader>>, &'a ArrayReaderBuilderContext
.ok()
.map(|f| f.data_type().to_owned())
.unwrap_or_else(|| {
- ArrowType::List(Box::new(item_reader_type.clone()))
+ ArrowType::List(Box::new(Field::new(
+ list_type.name(),
+ item_reader_type.clone(),
+ list_type.is_optional(),
+ )))
});
let list_array_reader: Box<dyn ArrayReader> = match arrow_type {
@@ -2309,7 +2313,7 @@ mod tests {
let mut list_array_reader = ListArrayReader::<i32>::new(
Box::new(item_array_reader),
- ArrowType::List(Box::new(ArrowType::Int32)),
+ ArrowType::List(Box::new(Field::new("item", ArrowType::Int32, true))),
ArrowType::Int32,
1,
1,
@@ -2363,7 +2367,7 @@ mod tests {
let mut list_array_reader = ListArrayReader::<i64>::new(
Box::new(item_array_reader),
- ArrowType::LargeList(Box::new(ArrowType::Int32)),
+ ArrowType::LargeList(Box::new(Field::new("item", ArrowType::Int32, true))),
ArrowType::Int32,
1,
1,
diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs
index de8bc7c..f2279fb 100644
--- a/rust/parquet/src/arrow/arrow_writer.rs
+++ b/rust/parquet/src/arrow/arrow_writer.rs
@@ -722,7 +722,7 @@ mod tests {
// define schema
let schema = Schema::new(vec![Field::new(
"a",
- DataType::List(Box::new(DataType::Int32)),
+ DataType::List(Box::new(Field::new("item", DataType::Int32, true))),
false,
)]);
@@ -735,11 +735,15 @@ mod tests {
arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice());
// Construct a list array from the above two
- let a_list_data = ArrayData::builder(DataType::List(Box::new(DataType::Int32)))
- .len(5)
- .add_buffer(a_value_offsets)
- .add_child_data(a_values.data())
- .build();
+ let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new(
+ "items",
+ DataType::Int32,
+ true,
+ ))))
+ .len(5)
+ .add_buffer(a_value_offsets)
+ .add_child_data(a_values.data())
+ .build();
let a = ListArray::from(a_list_data);
// build a record batch
@@ -816,8 +820,11 @@ mod tests {
// define schema
let struct_field_d = Field::new("d", DataType::Float64, true);
let struct_field_f = Field::new("f", DataType::Float32, true);
- let struct_field_g =
- Field::new("g", DataType::List(Box::new(DataType::Int16)), false);
+ let struct_field_g = Field::new(
+ "g",
+ DataType::List(Box::new(Field::new("items", DataType::Int16, false))),
+ false,
+ );
let struct_field_e = Field::new(
"e",
DataType::Struct(vec![struct_field_f.clone(), struct_field_g.clone()]),
@@ -1251,11 +1258,15 @@ mod tests {
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
let a_value_offsets =
arrow::buffer::Buffer::from(&[0, 1, 3, 3, 6, 10].to_byte_slice());
- let a_list_data = ArrayData::builder(DataType::List(Box::new(DataType::Int32)))
- .len(5)
- .add_buffer(a_value_offsets)
- .add_child_data(a_values.data())
- .build();
+ let a_list_data = ArrayData::builder(DataType::List(Box::new(Field::new(
+ "item",
+ DataType::Int32,
+ true,
+ ))))
+ .len(5)
+ .add_buffer(a_value_offsets)
+ .add_child_data(a_values.data())
+ .build();
// I think this setup is incorrect because this should pass
assert_eq!(a_list_data.null_count(), 1);
@@ -1272,12 +1283,15 @@ mod tests {
let a_values = Int32Array::from(vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
let a_value_offsets =
arrow::buffer::Buffer::from(&[0i64, 1, 3, 3, 6, 10].to_byte_slice());
- let a_list_data =
- ArrayData::builder(DataType::LargeList(Box::new(DataType::Int32)))
- .len(5)
- .add_buffer(a_value_offsets)
- .add_child_data(a_values.data())
- .build();
+ let a_list_data = ArrayData::builder(DataType::LargeList(Box::new(Field::new(
+ "large_item",
+ DataType::Int32,
+ true,
+ ))))
+ .len(5)
+ .add_buffer(a_value_offsets)
+ .add_child_data(a_values.data())
+ .build();
// I think this setup is incorrect because this should pass
assert_eq!(a_list_data.null_count(), 1);
diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs
index 10270ff..c84e895 100644
--- a/rust/parquet/src/arrow/schema.rs
+++ b/rust/parquet/src/arrow/schema.rs
@@ -406,22 +406,18 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
.with_repetition(repetition)
.build()
}
- DataType::List(dtype)
- | DataType::FixedSizeList(dtype, _)
- | DataType::LargeList(dtype) => Type::group_type_builder(name)
- .with_fields(&mut vec![Rc::new(
- Type::group_type_builder("list")
- .with_fields(&mut vec![Rc::new({
- let list_field =
- Field::new("element", *dtype.clone(), field.is_nullable());
- arrow_to_parquet_type(&list_field)?
- })])
- .with_repetition(Repetition::REPEATED)
- .build()?,
- )])
- .with_logical_type(LogicalType::LIST)
- .with_repetition(Repetition::REQUIRED)
- .build(),
+ DataType::List(f) | DataType::FixedSizeList(f, _) | DataType::LargeList(f) => {
+ Type::group_type_builder(name)
+ .with_fields(&mut vec![Rc::new(
+ Type::group_type_builder("list")
+ .with_fields(&mut vec![Rc::new(arrow_to_parquet_type(f)?)])
+ .with_repetition(Repetition::REPEATED)
+ .build()?,
+ )])
+ .with_logical_type(LogicalType::LIST)
+ .with_repetition(Repetition::REQUIRED)
+ .build()
+ }
DataType::Struct(fields) => {
if fields.is_empty() {
return Err(ArrowError(
@@ -536,7 +532,11 @@ impl ParquetTypeConverter<'_> {
if self.is_self_included() {
self.to_primitive_type_inner().map(|dt| {
if self.is_repeated() {
- Some(DataType::List(Box::new(dt)))
+ Some(DataType::List(Box::new(Field::new(
+ self.schema.name(),
+ dt,
+ self.is_nullable(),
+ ))))
} else {
Some(dt)
}
@@ -630,8 +630,15 @@ impl ParquetTypeConverter<'_> {
/// This function takes care of logical type and repetition.
fn to_group_type(&self) -> Result<Option<DataType>> {
if self.is_repeated() {
- self.to_struct()
- .map(|opt| opt.map(|dt| DataType::List(Box::new(dt))))
+ self.to_struct().map(|opt| {
+ opt.map(|dt| {
+ DataType::List(Box::new(Field::new(
+ self.schema.name(),
+ dt,
+ self.is_nullable(),
+ )))
+ })
+ })
} else {
match self.schema.get_basic_info().logical_type() {
LogicalType::LIST => self.to_list(),
@@ -716,7 +723,15 @@ impl ParquetTypeConverter<'_> {
}
};
- item_type.map(|opt| opt.map(|dt| DataType::List(Box::new(dt))))
+ item_type.map(|opt| {
+ opt.map(|dt| {
+ DataType::List(Box::new(Field::new(
+ list_item.name(),
+ dt,
+ list_item.is_optional(),
+ )))
+ })
+ })
}
_ => Err(ArrowError(
"Group element type of list can only contain one field.".to_string(),
@@ -890,7 +905,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(DataType::Utf8)),
+ DataType::List(Box::new(Field::new("list", DataType::Utf8, true))),
false,
));
}
@@ -904,7 +919,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(DataType::Utf8)),
+ DataType::List(Box::new(Field::new("list", DataType::Utf8, true))),
true,
));
}
@@ -922,10 +937,11 @@ mod tests {
// }
// }
{
- let arrow_inner_list = DataType::List(Box::new(DataType::Int32));
+ let arrow_inner_list =
+ DataType::List(Box::new(Field::new("list", DataType::Int32, true)));
arrow_fields.push(Field::new(
"array_of_arrays",
- DataType::List(Box::new(arrow_inner_list)),
+ DataType::List(Box::new(Field::new("list", arrow_inner_list, true))),
true,
));
}
@@ -939,7 +955,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(DataType::Utf8)),
+ DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
true,
));
}
@@ -951,7 +967,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(DataType::Int32)),
+ DataType::List(Box::new(Field::new("element", DataType::Int32, true))),
true,
));
}
@@ -970,7 +986,7 @@ mod tests {
]);
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(arrow_struct)),
+ DataType::List(Box::new(Field::new("element", arrow_struct, true))),
true,
));
}
@@ -987,7 +1003,7 @@ mod tests {
DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(arrow_struct)),
+ DataType::List(Box::new(Field::new("array", arrow_struct, true))),
true,
));
}
@@ -1004,7 +1020,7 @@ mod tests {
DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(arrow_struct)),
+ DataType::List(Box::new(Field::new("my_list_tuple", arrow_struct, true))),
true,
));
}
@@ -1014,7 +1030,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"name",
- DataType::List(Box::new(DataType::Int32)),
+ DataType::List(Box::new(Field::new("name", DataType::Int32, true))),
true,
));
}
@@ -1180,20 +1196,24 @@ mod tests {
let inner_group_list = Field::new(
"innerGroup",
- DataType::List(Box::new(DataType::Struct(vec![Field::new(
- "leaf3",
- DataType::Int32,
+ DataType::List(Box::new(Field::new(
+ "innerGroup",
+ DataType::Struct(vec![Field::new("leaf3", DataType::Int32, true)]),
true,
- )]))),
+ ))),
true,
);
let outer_group_list = Field::new(
"outerGroup",
- DataType::List(Box::new(DataType::Struct(vec![
- Field::new("leaf2", DataType::Int32, true),
- inner_group_list,
- ]))),
+ DataType::List(Box::new(Field::new(
+ "outerGroup",
+ DataType::Struct(vec![
+ Field::new("leaf2", DataType::Int32, true),
+ inner_group_list,
+ ]),
+ true,
+ ))),
true,
);
arrow_fields.push(outer_group_list);
@@ -1261,7 +1281,11 @@ mod tests {
Field::new("double", DataType::Float64, true),
Field::new("float", DataType::Float32, true),
Field::new("string", DataType::Utf8, true),
- Field::new("bools", DataType::List(Box::new(DataType::Boolean)), true),
+ Field::new(
+ "bools",
+ DataType::List(Box::new(Field::new("bools", DataType::Boolean, true))),
+ true,
+ ),
Field::new("date", DataType::Date32(DateUnit::Day), true),
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
@@ -1327,7 +1351,11 @@ mod tests {
Field::new("double", DataType::Float64, true),
Field::new("float", DataType::Float32, true),
Field::new("string", DataType::Utf8, true),
- Field::new("bools", DataType::List(Box::new(DataType::Boolean)), true),
+ Field::new(
+ "bools",
+ DataType::List(Box::new(Field::new("element", DataType::Boolean, true))),
+ true,
+ ),
Field::new("date", DataType::Date32(DateUnit::Day), true),
Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
@@ -1346,7 +1374,15 @@ mod tests {
DataType::Struct(vec![
Field::new("bools", DataType::Boolean, false),
Field::new("uint32", DataType::UInt32, false),
- Field::new("int32", DataType::List(Box::new(DataType::Int32)), true),
+ Field::new(
+ "int32",
+ DataType::List(Box::new(Field::new(
+ "element",
+ DataType::Int32,
+ true,
+ ))),
+ true,
+ ),
]),
false,
),
@@ -1452,7 +1488,11 @@ mod tests {
),
Field::new("c19", DataType::Interval(IntervalUnit::DayTime), false),
Field::new("c20", DataType::Interval(IntervalUnit::YearMonth), false),
- Field::new("c21", DataType::List(Box::new(DataType::Boolean)), false),
+ Field::new(
+ "c21",
+ DataType::List(Box::new(Field::new("list", DataType::Boolean, true))),
+ false,
+ ),
// Field::new(
// "c22",
// DataType::FixedSizeList(Box::new(DataType::Boolean), 5),
@@ -1543,20 +1583,37 @@ mod tests {
let schema = Schema::new_with_metadata(
vec![
- Field::new("c21", DataType::List(Box::new(DataType::Boolean)), false),
+ Field::new(
+ "c21",
+ DataType::List(Box::new(Field::new(
+ "array",
+ DataType::Boolean,
+ true,
+ ))),
+ false,
+ ),
Field::new(
"c22",
- DataType::FixedSizeList(Box::new(DataType::Boolean), 5),
+ DataType::FixedSizeList(
+ Box::new(Field::new("items", DataType::Boolean, false)),
+ 5,
+ ),
false,
),
Field::new(
"c23",
- DataType::List(Box::new(DataType::LargeList(Box::new(
- DataType::Struct(vec![
- Field::new("a", DataType::Int16, true),
- Field::new("b", DataType::Float64, false),
- ]),
- )))),
+ DataType::List(Box::new(Field::new(
+ "items",
+ DataType::LargeList(Box::new(Field::new(
+ "items",
+ DataType::Struct(vec![
+ Field::new("a", DataType::Int16, true),
+ Field::new("b", DataType::Float64, false),
+ ]),
+ true,
+ ))),
+ true,
+ ))),
true,
),
],