You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/04/20 09:10:42 UTC

[arrow] branch master updated: ARROW-5187: [Rust] Add ability to convert StructArray to RecordBatch

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b38936f  ARROW-5187: [Rust] Add ability to convert StructArray to RecordBatch
b38936f is described below

commit b38936f08bcfe1594f7e2b2a199920186a3f1fbe
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Sat Apr 20 11:10:23 2019 +0200

    ARROW-5187: [Rust] Add ability to convert StructArray to RecordBatch
    
    The CPP version (http://arrow.apache.org/docs/python/generated/pyarrow.StructArray.html?highlight=flatten#pyarrow.StructArray.flatten) returns `Vec<ArrayRef>`, so not sure if we should also do the same.
    My justification for returning a `RecordBatch` is that we have the added convenience of not losing the schema of the columns.
    I'm using this for reading rows from a SQL database into Arrow data.
    
    Author: Neville Dipale <ne...@gmail.com>
    
    Closes #4178 from nevi-me/ARROW-5187 and squashes the following commits:
    
    7afbcfecb <Neville Dipale> make struct_array -> record_batch a conversion trait impl
    418bfe1e2 <Neville Dipale> ARROW-5187:  Add StructArray::flatten
---
 rust/arrow/src/record_batch.rs | 51 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs
index 62f93b8..b166951 100644
--- a/rust/arrow/src/record_batch.rs
+++ b/rust/arrow/src/record_batch.rs
@@ -95,6 +95,24 @@ impl RecordBatch {
     }
 }
 
+impl From<&StructArray> for RecordBatch {
+    /// Create a record batch from a struct array, using its fields as the schema.
+    ///
+    /// This currently does not flatten any nested struct types.
+    fn from(struct_array: &StructArray) -> Self {
+        if let DataType::Struct(fields) = struct_array.data_type() {
+            let schema = Schema::new(fields.clone());
+            let columns = struct_array.boxed_fields.clone(); // used as-is, no schema check here
+            RecordBatch {
+                schema: Arc::new(schema),
+                columns,
+            }
+        } else {
+            unreachable!("unable to get datatype as struct") // panics for non-Struct types
+        }
+    }
+}
+
 unsafe impl Send for RecordBatch {}
 unsafe impl Sync for RecordBatch {}
 
@@ -161,4 +179,37 @@ mod tests {
             RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a), Arc::new(b)]);
         assert!(!batch.is_ok());
     }
+
+    #[test]
+    fn create_record_batch_from_struct_array() {
+        let boolean_data = ArrayData::builder(DataType::Boolean)
+            .len(4)
+            .add_buffer(Buffer::from([12_u8])) // 12 == 0b1100; compared to [f, f, t, t] below
+            .build();
+        let int_data = ArrayData::builder(DataType::Int32)
+            .len(4)
+            .add_buffer(Buffer::from([42, 28, 19, 31].to_byte_slice()))
+            .build();
+        let struct_array = StructArray::from(vec![
+            (
+                Field::new("b", DataType::Boolean, false),
+                Arc::new(BooleanArray::from(vec![false, false, true, true]))
+                    as Arc<Array>,
+            ),
+            (
+                Field::new("c", DataType::Int32, false),
+                Arc::new(Int32Array::from(vec![42, 28, 19, 31])),
+            ),
+        ]);
+
+        let batch = RecordBatch::from(&struct_array); // conversion under test
+        assert_eq!(2, batch.num_columns());
+        assert_eq!(4, batch.num_rows());
+        assert_eq!(
+            struct_array.data_type(),
+            &DataType::Struct(batch.schema().fields().to_vec()) // schema must mirror the fields
+        );
+        assert_eq!(batch.column(0).data(), boolean_data); // columns match hand-built ArrayData
+        assert_eq!(batch.column(1).data(), int_data);
+    }
 }