You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/04/12 23:44:27 UTC
[arrow-rs] branch master updated: Create RecordBatch With Non-Zero Row Count But No Columns (#1536) (#1552)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c9549bba3 Create RecordBatch With Non-Zero Row Count But No Columns (#1536) (#1552)
c9549bba3 is described below
commit c9549bba3b03e2c79f4cbcf98060f8ad82566c40
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Apr 13 00:44:22 2022 +0100
Create RecordBatch With Non-Zero Row Count But No Columns (#1536) (#1552)
* Support empty RecordBatch (#1536)
* Placate clippy
* Review feedback
* Fix doc
* Fix create_record_batch_slice_empty_batch test
---
arrow/src/record_batch.rs | 109 ++++++++++++++++++++++++++++++++-------------
arrow/src/util/data_gen.rs | 1 +
2 files changed, 78 insertions(+), 32 deletions(-)
diff --git a/arrow/src/record_batch.rs b/arrow/src/record_batch.rs
index 10bd4c5c5..ae8fae58f 100644
--- a/arrow/src/record_batch.rs
+++ b/arrow/src/record_batch.rs
@@ -41,6 +41,11 @@ use crate::error::{ArrowError, Result};
pub struct RecordBatch {
schema: SchemaRef,
columns: Vec<Arc<dyn Array>>,
+
+ /// The number of rows in this RecordBatch
+ ///
+ /// This is stored separately from the columns to handle the case of no columns
+ row_count: usize,
}
impl RecordBatch {
@@ -77,8 +82,7 @@ impl RecordBatch {
/// ```
pub fn try_new(schema: SchemaRef, columns: Vec<ArrayRef>) -> Result<Self> {
let options = RecordBatchOptions::default();
- Self::validate_new_batch(&schema, columns.as_slice(), &options)?;
- Ok(RecordBatch { schema, columns })
+ Self::try_new_impl(schema, columns, &options)
}
/// Creates a `RecordBatch` from a schema and columns, with additional options,
@@ -90,8 +94,7 @@ impl RecordBatch {
columns: Vec<ArrayRef>,
options: &RecordBatchOptions,
) -> Result<Self> {
- Self::validate_new_batch(&schema, columns.as_slice(), options)?;
- Ok(RecordBatch { schema, columns })
+ Self::try_new_impl(schema, columns, options)
}
/// Creates a new empty [`RecordBatch`].
@@ -101,23 +104,21 @@ impl RecordBatch {
.iter()
.map(|field| new_empty_array(field.data_type()))
.collect();
- RecordBatch { schema, columns }
+
+ RecordBatch {
+ schema,
+ columns,
+ row_count: 0,
+ }
}
/// Validate the schema and columns using [`RecordBatchOptions`]. Returns an error
- /// if any validation check fails.
- fn validate_new_batch(
- schema: &SchemaRef,
- columns: &[ArrayRef],
+ /// if any validation check fails, otherwise returns the created [`Self`]
+ fn try_new_impl(
+ schema: SchemaRef,
+ columns: Vec<ArrayRef>,
options: &RecordBatchOptions,
- ) -> Result<()> {
- // check that there are some columns
- if columns.is_empty() {
- return Err(ArrowError::InvalidArgumentError(
- "at least one column must be defined to create a record batch"
- .to_string(),
- ));
- }
+ ) -> Result<Self> {
// check that number of fields in schema match column length
if schema.fields().len() != columns.len() {
return Err(ArrowError::InvalidArgumentError(format!(
@@ -128,11 +129,23 @@ impl RecordBatch {
}
// check that all columns have the same row count
- let row_count = columns[0].data().len();
+ let row_count = options
+ .row_count
+ .or_else(|| columns.first().map(|col| col.len()))
+ .ok_or_else(|| {
+ ArrowError::InvalidArgumentError(
+ "must either specify a row count or at least one column".to_string(),
+ )
+ })?;
+
if columns.iter().any(|c| c.len() != row_count) {
- return Err(ArrowError::InvalidArgumentError(
- "all columns in a record batch must have the same length".to_string(),
- ));
+ let err = match options.row_count {
+ Some(_) => {
+ "all columns in a record batch must have the specified row count"
+ }
+ None => "all columns in a record batch must have the same length",
+ };
+ return Err(ArrowError::InvalidArgumentError(err.to_string()));
}
// function for comparing column type and field type
@@ -163,7 +176,11 @@ impl RecordBatch {
i)));
}
- Ok(())
+ Ok(RecordBatch {
+ schema,
+ columns,
+ row_count,
+ })
}
/// Returns the [`Schema`](crate::datatypes::Schema) of the record batch.
@@ -218,10 +235,6 @@ impl RecordBatch {
/// Returns the number of rows in each column.
///
- /// # Panics
- ///
- /// Panics if the `RecordBatch` contains no columns.
- ///
/// # Example
///
/// ```
@@ -243,7 +256,7 @@ impl RecordBatch {
/// # }
/// ```
pub fn num_rows(&self) -> usize {
- self.columns[0].data().len()
+ self.row_count
}
/// Get a reference to a column's array by index.
@@ -267,10 +280,6 @@ impl RecordBatch {
///
/// Panics if `offset` with `length` is greater than column length.
pub fn slice(&self, offset: usize, length: usize) -> RecordBatch {
- if self.schema.fields().is_empty() {
- assert!((offset + length) == 0);
- return RecordBatch::new_empty(self.schema.clone());
- }
assert!((offset + length) <= self.num_rows());
let columns = self
@@ -282,6 +291,7 @@ impl RecordBatch {
Self {
schema: self.schema.clone(),
columns,
+ row_count: length,
}
}
@@ -402,15 +412,20 @@ impl RecordBatch {
/// Options that control the behaviour used when creating a [`RecordBatch`].
#[derive(Debug)]
+#[non_exhaustive]
pub struct RecordBatchOptions {
/// Match field names of structs and lists. If set to `true`, the names must match.
pub match_field_names: bool,
+
+ /// Optional row count, useful for specifying a row count for a RecordBatch with no columns
+ pub row_count: Option<usize>,
}
impl Default for RecordBatchOptions {
fn default() -> Self {
Self {
match_field_names: true,
+ row_count: None,
}
}
}
@@ -426,6 +441,7 @@ impl From<&StructArray> for RecordBatch {
let columns = struct_array.boxed_fields.clone();
RecordBatch {
schema: Arc::new(schema),
+ row_count: struct_array.len(),
columns,
}
} else {
@@ -532,7 +548,7 @@ mod tests {
}
#[test]
- #[should_panic(expected = "assertion failed: (offset + length) == 0")]
+ #[should_panic(expected = "assertion failed: (offset + length) <= self.num_rows()")]
fn create_record_batch_slice_empty_batch() {
let schema = Schema::new(vec![]);
@@ -644,6 +660,7 @@ mod tests {
// creating the batch without field name validation should pass
let options = RecordBatchOptions {
match_field_names: false,
+ row_count: None,
};
let batch = RecordBatch::try_new_with_options(schema, vec![a], &options);
assert!(batch.is_ok());
@@ -934,4 +951,32 @@ mod tests {
assert_eq!(expected, record_batch.project(&[0, 2]).unwrap());
}
+
+ #[test]
+ fn test_no_column_record_batch() {
+ let schema = Arc::new(Schema::new(vec![]));
+
+ let err = RecordBatch::try_new(schema.clone(), vec![]).unwrap_err();
+ assert!(err
+ .to_string()
+ .contains("must either specify a row count or at least one column"));
+
+ let options = RecordBatchOptions {
+ row_count: Some(10),
+ ..Default::default()
+ };
+
+ let ok =
+ RecordBatch::try_new_with_options(schema.clone(), vec![], &options).unwrap();
+ assert_eq!(ok.num_rows(), 10);
+
+ let a = ok.slice(2, 5);
+ assert_eq!(a.num_rows(), 5);
+
+ let b = ok.slice(5, 0);
+ assert_eq!(b.num_rows(), 0);
+
+ assert_ne!(a, b);
+ assert_eq!(b, RecordBatch::new_empty(schema))
+ }
}
diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs
index 35b65ef30..21b8ee8c9 100644
--- a/arrow/src/util/data_gen.rs
+++ b/arrow/src/util/data_gen.rs
@@ -49,6 +49,7 @@ pub fn create_random_batch(
columns,
&RecordBatchOptions {
match_field_names: false,
+ row_count: None,
},
)
}