You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/12 23:44:27 UTC

[arrow-rs] branch master updated: Create RecordBatch With Non-Zero Row Count But No Columns (#1536) (#1552)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new c9549bba3 Create RecordBatch With Non-Zero Row Count But No Columns (#1536) (#1552)
c9549bba3 is described below

commit c9549bba3b03e2c79f4cbcf98060f8ad82566c40
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Apr 13 00:44:22 2022 +0100

    Create RecordBatch With Non-Zero Row Count But No Columns (#1536) (#1552)
    
    * Support empty RecordBatch (#1536)
    
    * Placate clippy
    
    * Review feedback
    
    * Fix doc
    
    * Fix create_record_batch_slice_empty_batch test
---
 arrow/src/record_batch.rs  | 109 ++++++++++++++++++++++++++++++++-------------
 arrow/src/util/data_gen.rs |   1 +
 2 files changed, 78 insertions(+), 32 deletions(-)

diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs
index 10bd4c5c5..ae8fae58f 100644
--- a/arrow/src/record_batch.rs
+++ b/arrow/src/record_batch.rs
@@ -41,6 +41,11 @@ use crate::error::{ArrowError, Result};
 pub struct RecordBatch {
     schema: SchemaRef,
     columns: Vec<Arc<dyn Array>>,
+
+    /// The number of rows in this RecordBatch
+    ///
+    /// This is stored separately from the columns to handle the case of no columns
+    row_count: usize,
 }
 
 impl RecordBatch {
@@ -77,8 +82,7 @@ impl RecordBatch {
     /// ```
     pub fn try_new(schema: SchemaRef, columns: Vec<ArrayRef>) -> Result<Self> {
         let options = RecordBatchOptions::default();
-        Self::validate_new_batch(&schema, columns.as_slice(), &options)?;
-        Ok(RecordBatch { schema, columns })
+        Self::try_new_impl(schema, columns, &options)
     }
 
     /// Creates a `RecordBatch` from a schema and columns, with additional options,
@@ -90,8 +94,7 @@ impl RecordBatch {
         columns: Vec<ArrayRef>,
         options: &RecordBatchOptions,
     ) -> Result<Self> {
-        Self::validate_new_batch(&schema, columns.as_slice(), options)?;
-        Ok(RecordBatch { schema, columns })
+        Self::try_new_impl(schema, columns, options)
     }
 
     /// Creates a new empty [`RecordBatch`].
@@ -101,23 +104,21 @@ impl RecordBatch {
             .iter()
             .map(|field| new_empty_array(field.data_type()))
             .collect();
-        RecordBatch { schema, columns }
+
+        RecordBatch {
+            schema,
+            columns,
+            row_count: 0,
+        }
     }
 
     /// Validate the schema and columns using [`RecordBatchOptions`]. Returns an error
-    /// if any validation check fails.
-    fn validate_new_batch(
-        schema: &SchemaRef,
-        columns: &[ArrayRef],
+    /// if any validation check fails, otherwise returns the created [`Self`]
+    fn try_new_impl(
+        schema: SchemaRef,
+        columns: Vec<ArrayRef>,
         options: &RecordBatchOptions,
-    ) -> Result<()> {
-        // check that there are some columns
-        if columns.is_empty() {
-            return Err(ArrowError::InvalidArgumentError(
-                "at least one column must be defined to create a record batch"
-                    .to_string(),
-            ));
-        }
+    ) -> Result<Self> {
         // check that number of fields in schema match column length
         if schema.fields().len() != columns.len() {
             return Err(ArrowError::InvalidArgumentError(format!(
@@ -128,11 +129,23 @@ impl RecordBatch {
         }
 
         // check that all columns have the same row count
-        let row_count = columns[0].data().len();
+        let row_count = options
+            .row_count
+            .or_else(|| columns.first().map(|col| col.len()))
+            .ok_or_else(|| {
+                ArrowError::InvalidArgumentError(
+                    "must either specify a row count or at least one column".to_string(),
+                )
+            })?;
+
         if columns.iter().any(|c| c.len() != row_count) {
-            return Err(ArrowError::InvalidArgumentError(
-                "all columns in a record batch must have the same length".to_string(),
-            ));
+            let err = match options.row_count {
+                Some(_) => {
+                    "all columns in a record batch must have the specified row count"
+                }
+                None => "all columns in a record batch must have the same length",
+            };
+            return Err(ArrowError::InvalidArgumentError(err.to_string()));
         }
 
         // function for comparing column type and field type
@@ -163,7 +176,11 @@ impl RecordBatch {
                 i)));
         }
 
-        Ok(())
+        Ok(RecordBatch {
+            schema,
+            columns,
+            row_count,
+        })
     }
 
     /// Returns the [`Schema`](crate::datatypes::Schema) of the record batch.
@@ -218,10 +235,6 @@ impl RecordBatch {
 
     /// Returns the number of rows in each column.
     ///
-    /// # Panics
-    ///
-    /// Panics if the `RecordBatch` contains no columns.
-    ///
     /// # Example
     ///
     /// ```
@@ -243,7 +256,7 @@ impl RecordBatch {
     /// # }
     /// ```
     pub fn num_rows(&self) -> usize {
-        self.columns[0].data().len()
+        self.row_count
     }
 
     /// Get a reference to a column's array by index.
@@ -267,10 +280,6 @@ impl RecordBatch {
     ///
     /// Panics if `offset` with `length` is greater than column length.
     pub fn slice(&self, offset: usize, length: usize) -> RecordBatch {
-        if self.schema.fields().is_empty() {
-            assert!((offset + length) == 0);
-            return RecordBatch::new_empty(self.schema.clone());
-        }
         assert!((offset + length) <= self.num_rows());
 
         let columns = self
@@ -282,6 +291,7 @@ impl RecordBatch {
         Self {
             schema: self.schema.clone(),
             columns,
+            row_count: length,
         }
     }
 
@@ -402,15 +412,20 @@ impl RecordBatch {
 
 /// Options that control the behaviour used when creating a [`RecordBatch`].
 #[derive(Debug)]
+#[non_exhaustive]
 pub struct RecordBatchOptions {
     /// Match field names of structs and lists. If set to `true`, the names must match.
     pub match_field_names: bool,
+
+    /// Optional row count, useful for specifying a row count for a RecordBatch with no columns
+    pub row_count: Option<usize>,
 }
 
 impl Default for RecordBatchOptions {
     fn default() -> Self {
         Self {
             match_field_names: true,
+            row_count: None,
         }
     }
 }
@@ -426,6 +441,7 @@ impl From<&StructArray> for RecordBatch {
             let columns = struct_array.boxed_fields.clone();
             RecordBatch {
                 schema: Arc::new(schema),
+                row_count: struct_array.len(),
                 columns,
             }
         } else {
@@ -532,7 +548,7 @@ mod tests {
     }
 
     #[test]
-    #[should_panic(expected = "assertion failed: (offset + length) == 0")]
+    #[should_panic(expected = "assertion failed: (offset + length) <= self.num_rows()")]
     fn create_record_batch_slice_empty_batch() {
         let schema = Schema::new(vec![]);
 
@@ -644,6 +660,7 @@ mod tests {
         // creating the batch without field name validation should pass
         let options = RecordBatchOptions {
             match_field_names: false,
+            row_count: None,
         };
         let batch = RecordBatch::try_new_with_options(schema, vec![a], &options);
         assert!(batch.is_ok());
@@ -934,4 +951,32 @@ mod tests {
 
         assert_eq!(expected, record_batch.project(&[0, 2]).unwrap());
     }
+
+    #[test]
+    fn test_no_column_record_batch() {
+        let schema = Arc::new(Schema::new(vec![]));
+
+        let err = RecordBatch::try_new(schema.clone(), vec![]).unwrap_err();
+        assert!(err
+            .to_string()
+            .contains("must either specify a row count or at least one column"));
+
+        let options = RecordBatchOptions {
+            row_count: Some(10),
+            ..Default::default()
+        };
+
+        let ok =
+            RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap();
+        assert_eq!(ok.num_rows(), 10);
+
+        let a = ok.slice(2, 5);
+        assert_eq!(a.num_rows(), 5);
+
+        let b = ok.slice(5, 0);
+        assert_eq!(b.num_rows(), 0);
+
+        assert_ne!(a, b);
+        assert_eq!(b, RecordBatch::new_empty(schema))
+    }
 }
diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index 35b65ef30..21b8ee8c9 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -49,6 +49,7 @@ pub fn create_random_batch(
         columns,
         &RecordBatchOptions {
             match_field_names: false,
+            row_count: None,
         },
     )
 }