You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/06 16:03:19 UTC

[arrow-rs] branch master updated: Write validity buffer for UnionArray in V4 IPC message (#1794)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new e3191e739 Write validity buffer for UnionArray in V4 IPC message (#1794)
e3191e739 is described below

commit e3191e739ae0f029057b71a497c0f90961d39158
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Mon Jun 6 09:03:14 2022 -0700

    Write validity buffer for UnionArray in V4 IPC message (#1794)
    
    * Write validity buffer for Union Array in V4 IPC message
    
    * Add test
    
    * Fix clippy
    
    * Fix clippy
---
 arrow/src/ipc/writer.rs | 86 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 80 insertions(+), 6 deletions(-)

diff --git a/arrow/src/ipc/writer.rs b/arrow/src/ipc/writer.rs
index 70e07acae..120eb7ab9 100644
--- a/arrow/src/ipc/writer.rs
+++ b/arrow/src/ipc/writer.rs
@@ -336,6 +336,7 @@ impl IpcDataGenerator {
                 offset,
                 array.len(),
                 array.null_count(),
+                write_options,
             );
         }
 
@@ -389,6 +390,7 @@ impl IpcDataGenerator {
             0,
             array_data.len(),
             array_data.null_count(),
+            write_options,
         );
 
         // write data
@@ -849,7 +851,18 @@ fn write_continuation<W: Write>(
     Ok(written)
 }
 
+/// In V4, null types have no validity bitmap
+/// In V5 and later, null and union types have no validity bitmap
+fn has_validity_bitmap(data_type: &DataType, write_options: &IpcWriteOptions) -> bool {
+    if write_options.metadata_version < ipc::MetadataVersion::V5 {
+        !matches!(data_type, DataType::Null)
+    } else {
+        !matches!(data_type, DataType::Null | DataType::Union(_, _, _))
+    }
+}
+
 /// Write array data to a vector of bytes
+#[allow(clippy::too_many_arguments)]
 fn write_array_data(
     array_data: &ArrayData,
     buffers: &mut Vec<ipc::Buffer>,
@@ -858,6 +871,7 @@ fn write_array_data(
     offset: i64,
     num_rows: usize,
     null_count: usize,
+    write_options: &IpcWriteOptions,
 ) -> i64 {
     let mut offset = offset;
     if !matches!(array_data.data_type(), DataType::Null) {
@@ -867,12 +881,7 @@ fn write_array_data(
         // where null_count is always 0.
         nodes.push(ipc::FieldNode::new(num_rows as i64, num_rows as i64));
     }
-    // NullArray does not have any buffers, thus the null buffer is not generated
-    // UnionArray does not have a validity buffer
-    if !matches!(
-        array_data.data_type(),
-        DataType::Null | DataType::Union(_, _, _)
-    ) {
+    if has_validity_bitmap(array_data.data_type(), write_options) {
         // write null buffer if exists
         let null_buffer = match array_data.null_buffer() {
             None => {
@@ -904,6 +913,7 @@ fn write_array_data(
                 offset,
                 data_ref.len(),
                 data_ref.null_count(),
+                write_options,
             );
         });
     }
@@ -1433,4 +1443,68 @@ mod tests {
             },
         );
     }
+
+    fn write_union_file(options: IpcWriteOptions) {
+        let schema = Schema::new(vec![Field::new(
+            "union",
+            DataType::Union(
+                vec![
+                    Field::new("a", DataType::Int32, false),
+                    Field::new("c", DataType::Float64, false),
+                ],
+                vec![0, 1],
+                UnionMode::Sparse,
+            ),
+            true,
+        )]);
+        let mut builder = UnionBuilder::new_sparse(5);
+        builder.append::<Int32Type>("a", 1).unwrap();
+        builder.append_null::<Int32Type>("a").unwrap();
+        builder.append::<Float64Type>("c", 3.0).unwrap();
+        builder.append_null::<Float64Type>("c").unwrap();
+        builder.append::<Int32Type>("a", 4).unwrap();
+        let union = builder.build().unwrap();
+
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![Arc::new(union) as ArrayRef],
+        )
+        .unwrap();
+        let file_name = "target/debug/testdata/union.arrow_file";
+        {
+            let file = File::create(&file_name).unwrap();
+            let mut writer =
+                FileWriter::try_new_with_options(file, &schema, options).unwrap();
+
+            writer.write(&batch).unwrap();
+            writer.finish().unwrap();
+        }
+
+        {
+            let file = File::open(&file_name).unwrap();
+            let reader = FileReader::try_new(file, None).unwrap();
+            reader.for_each(|maybe_batch| {
+                maybe_batch
+                    .unwrap()
+                    .columns()
+                    .iter()
+                    .zip(batch.columns())
+                    .for_each(|(a, b)| {
+                        assert_eq!(a.data_type(), b.data_type());
+                        assert_eq!(a.len(), b.len());
+                        assert_eq!(a.null_count(), b.null_count());
+                    });
+            });
+        }
+    }
+
+    #[test]
+    fn test_write_union_file_v4_v5() {
+        write_union_file(
+            IpcWriteOptions::try_new(8, false, MetadataVersion::V4).unwrap(),
+        );
+        write_union_file(
+            IpcWriteOptions::try_new(8, false, MetadataVersion::V5).unwrap(),
+        );
+    }
 }