You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/06 16:03:19 UTC
[arrow-rs] branch master updated: Write validity buffer for UnionArray in V4 IPC message (#1794)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new e3191e739 Write validity buffer for UnionArray in V4 IPC message (#1794)
e3191e739 is described below
commit e3191e739ae0f029057b71a497c0f90961d39158
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Mon Jun 6 09:03:14 2022 -0700
Write validity buffer for UnionArray in V4 IPC message (#1794)
* Write validity buffer for UnionArray in V4 IPC message
* Add test
* Fix clippy
* Fix clippy
---
arrow/src/ipc/writer.rs | 86 +++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 80 insertions(+), 6 deletions(-)
diff --git a/arrow/src/ipc/writer.rs b/arrow/src/ipc/writer.rs
index 70e07acae..120eb7ab9 100644
--- a/arrow/src/ipc/writer.rs
+++ b/arrow/src/ipc/writer.rs
@@ -336,6 +336,7 @@ impl IpcDataGenerator {
offset,
array.len(),
array.null_count(),
+ write_options,
);
}
@@ -389,6 +390,7 @@ impl IpcDataGenerator {
0,
array_data.len(),
array_data.null_count(),
+ write_options,
);
// write data
@@ -849,7 +851,18 @@ fn write_continuation<W: Write>(
Ok(written)
}
+/// In V4, only null types have no validity bitmap (union types still have one)
+/// In V5 and later, neither null nor union types have a validity bitmap
+fn has_validity_bitmap(data_type: &DataType, write_options: &IpcWriteOptions) -> bool {
+ if write_options.metadata_version < ipc::MetadataVersion::V5 {
+ !matches!(data_type, DataType::Null)
+ } else {
+ !matches!(data_type, DataType::Null | DataType::Union(_, _, _))
+ }
+}
+
/// Write array data to a vector of bytes
+#[allow(clippy::too_many_arguments)]
fn write_array_data(
array_data: &ArrayData,
buffers: &mut Vec<ipc::Buffer>,
@@ -858,6 +871,7 @@ fn write_array_data(
offset: i64,
num_rows: usize,
null_count: usize,
+ write_options: &IpcWriteOptions,
) -> i64 {
let mut offset = offset;
if !matches!(array_data.data_type(), DataType::Null) {
@@ -867,12 +881,7 @@ fn write_array_data(
// where null_count is always 0.
nodes.push(ipc::FieldNode::new(num_rows as i64, num_rows as i64));
}
- // NullArray does not have any buffers, thus the null buffer is not generated
- // UnionArray does not have a validity buffer
- if !matches!(
- array_data.data_type(),
- DataType::Null | DataType::Union(_, _, _)
- ) {
+ if has_validity_bitmap(array_data.data_type(), write_options) {
// write null buffer if exists
let null_buffer = match array_data.null_buffer() {
None => {
@@ -904,6 +913,7 @@ fn write_array_data(
offset,
data_ref.len(),
data_ref.null_count(),
+ write_options,
);
});
}
@@ -1433,4 +1443,68 @@ mod tests {
},
);
}
+
+ fn write_union_file(options: IpcWriteOptions) {
+ let schema = Schema::new(vec![Field::new(
+ "union",
+ DataType::Union(
+ vec![
+ Field::new("a", DataType::Int32, false),
+ Field::new("c", DataType::Float64, false),
+ ],
+ vec![0, 1],
+ UnionMode::Sparse,
+ ),
+ true,
+ )]);
+ let mut builder = UnionBuilder::new_sparse(5);
+ builder.append::<Int32Type>("a", 1).unwrap();
+ builder.append_null::<Int32Type>("a").unwrap();
+ builder.append::<Float64Type>("c", 3.0).unwrap();
+ builder.append_null::<Float64Type>("c").unwrap();
+ builder.append::<Int32Type>("a", 4).unwrap();
+ let union = builder.build().unwrap();
+
+ let batch = RecordBatch::try_new(
+ Arc::new(schema.clone()),
+ vec![Arc::new(union) as ArrayRef],
+ )
+ .unwrap();
+ let file_name = "target/debug/testdata/union.arrow_file";
+ {
+ let file = File::create(&file_name).unwrap();
+ let mut writer =
+ FileWriter::try_new_with_options(file, &schema, options).unwrap();
+
+ writer.write(&batch).unwrap();
+ writer.finish().unwrap();
+ }
+
+ {
+ let file = File::open(&file_name).unwrap();
+ let reader = FileReader::try_new(file, None).unwrap();
+ reader.for_each(|maybe_batch| {
+ maybe_batch
+ .unwrap()
+ .columns()
+ .iter()
+ .zip(batch.columns())
+ .for_each(|(a, b)| {
+ assert_eq!(a.data_type(), b.data_type());
+ assert_eq!(a.len(), b.len());
+ assert_eq!(a.null_count(), b.null_count());
+ });
+ });
+ }
+ }
+
+ #[test]
+ fn test_write_union_file_v4_v5() {
+ write_union_file(
+ IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap(),
+ );
+ write_union_file(
+ IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap(),
+ );
+ }
}