You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/01/01 10:48:33 UTC
[arrow] branch master updated: ARROW-10656: [Rust] Allow schema
validation to ignore field names and only check data types on new batch
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 118f462 ARROW-10656: [Rust] Allow schema validation to ignore field names and only check data types on new batch
118f462 is described below
commit 118f4622934409178cce97881c752474840571e4
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Fri Jan 1 05:47:30 2021 -0500
ARROW-10656: [Rust] Allow schema validation to ignore field names and only check data types on new batch
This adds the option to create a new record batch with less strict validation that ignores nested (list and struct) field names, comparing only data types and nullability.
The default behaviour (strict field-name matching) is preserved.
Closes #8988 from nevi-me/ARROW-10656
Authored-by: Neville Dipale <ne...@gmail.com>
Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
rust/arrow/src/datatypes.rs | 25 +++++++++++++
rust/arrow/src/record_batch.rs | 81 +++++++++++++++++++++++++++++++++++-------
2 files changed, 93 insertions(+), 13 deletions(-)
diff --git a/rust/arrow/src/datatypes.rs b/rust/arrow/src/datatypes.rs
index d2cf47e..125adc4 100644
--- a/rust/arrow/src/datatypes.rs
+++ b/rust/arrow/src/datatypes.rs
@@ -1246,6 +1246,31 @@ impl DataType {
| Float64
)
}
+
+ /// Compares the datatype with another, ignoring nested field names
+ /// and metadata
+ pub(crate) fn equals_datatype(&self, other: &DataType) -> bool {
+ match (&self, other) {
+ (DataType::List(a), DataType::List(b))
+ | (DataType::LargeList(a), DataType::LargeList(b)) => {
+ a.is_nullable() == b.is_nullable()
+ && a.data_type().equals_datatype(b.data_type())
+ }
+ (DataType::FixedSizeList(a, a_size), DataType::FixedSizeList(b, b_size)) => {
+ a_size == b_size
+ && a.is_nullable() == b.is_nullable()
+ && a.data_type().equals_datatype(b.data_type())
+ }
+ (DataType::Struct(a), DataType::Struct(b)) => {
+ a.len() == b.len()
+ && a.iter().zip(b).all(|(a, b)| {
+ a.is_nullable() == b.is_nullable()
+ && a.data_type().equals_datatype(b.data_type())
+ })
+ }
+ _ => self == other,
+ }
+ }
}
impl Field {
diff --git a/rust/arrow/src/record_batch.rs b/rust/arrow/src/record_batch.rs
index b4aa97d..14731b6 100644
--- a/rust/arrow/src/record_batch.rs
+++ b/rust/arrow/src/record_batch.rs
@@ -75,6 +75,25 @@ impl RecordBatch {
/// # }
/// ```
pub fn try_new(schema: SchemaRef, columns: Vec<ArrayRef>) -> Result<Self> {
+ let options = RecordBatchOptions::default();
+ Self::validate_new_batch(&schema, columns.as_slice(), &options)?;
+ Ok(RecordBatch { schema, columns })
+ }
+
+ pub fn try_new_with_options(
+ schema: SchemaRef,
+ columns: Vec<ArrayRef>,
+ options: &RecordBatchOptions,
+ ) -> Result<Self> {
+ Self::validate_new_batch(&schema, columns.as_slice(), options)?;
+ Ok(RecordBatch { schema, columns })
+ }
+
+ fn validate_new_batch(
+ schema: &SchemaRef,
+ columns: &[ArrayRef],
+ options: &RecordBatchOptions,
+ ) -> Result<()> {
// check that there are some columns
if columns.is_empty() {
return Err(ArrowError::InvalidArgumentError(
@@ -93,22 +112,45 @@ impl RecordBatch {
// check that all columns have the same row count, and match the schema
let len = columns[0].data().len();
- for (i, column) in columns.iter().enumerate() {
- if column.len() != len {
- return Err(ArrowError::InvalidArgumentError(
- "all columns in a record batch must have the same length".to_string(),
- ));
+ // This is a bit repetitive, but it is better to check the condition outside the loop
+ if options.match_field_names {
+ for (i, column) in columns.iter().enumerate() {
+ if column.len() != len {
+ return Err(ArrowError::InvalidArgumentError(
+ "all columns in a record batch must have the same length"
+ .to_string(),
+ ));
+ }
+ if column.data_type() != schema.field(i).data_type() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "column types must match schema types, expected {:?} but found {:?} at column index {}",
+ schema.field(i).data_type(),
+ column.data_type(),
+ i)));
+ }
}
- // list types can have different names, but we only need the data types to be the same
- if column.data_type() != schema.field(i).data_type() {
- return Err(ArrowError::InvalidArgumentError(format!(
- "column types must match schema types, expected {:?} but found {:?} at column index {}",
- schema.field(i).data_type(),
- column.data_type(),
- i)));
+ } else {
+ for (i, column) in columns.iter().enumerate() {
+ if column.len() != len {
+ return Err(ArrowError::InvalidArgumentError(
+ "all columns in a record batch must have the same length"
+ .to_string(),
+ ));
+ }
+ if !column
+ .data_type()
+ .equals_datatype(schema.field(i).data_type())
+ {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "column types must match schema types, expected {:?} but found {:?} at column index {}",
+ schema.field(i).data_type(),
+ column.data_type(),
+ i)));
+ }
}
}
- Ok(RecordBatch { schema, columns })
+
+ Ok(())
}
/// Returns the [`Schema`](crate::datatypes::Schema) of the record batch.
@@ -187,6 +229,19 @@ impl RecordBatch {
}
}
+#[derive(Debug)]
+pub struct RecordBatchOptions {
+ pub match_field_names: bool,
+}
+
+impl Default for RecordBatchOptions {
+ fn default() -> Self {
+ Self {
+ match_field_names: true,
+ }
+ }
+}
+
impl From<&StructArray> for RecordBatch {
/// Create a record batch from struct array.
///