You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/12/01 14:52:02 UTC
[arrow-datafusion] branch master updated: Add check for nested types in equivalent names and types (#4380)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new a0485e7e0 Add check for nested types in equivalent names and types (#4380)
a0485e7e0 is described below
commit a0485e7e0fc69aa5dca35339108142a0fd1dc703
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Thu Dec 1 09:51:57 2022 -0500
Add check for nested types in equivalent names and types (#4380)
* Add check for nested types in equivalent names and types
* Clippy
---
datafusion/common/src/dfschema.rs | 151 +++++++++++++++++++++++++++++++++++++-
1 file changed, 149 insertions(+), 2 deletions(-)
diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index 005488153..8ad179c5b 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -351,11 +351,47 @@ impl DFSchema {
let other_fields = other.fields().iter();
self_fields.zip(other_fields).all(|(f1, f2)| {
f1.qualifier() == f2.qualifier()
- && f1.data_type() == f2.data_type()
&& f1.name() == f2.name()
+ && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type())
})
}
+ /// Returns true if two [`DataType`]s are semantically equal (same
+ /// name and type), ignoring both metadata and nullability of nested fields.
+ ///
+ /// Request to provide this upstream: <https://github.com/apache/arrow-rs/issues/3199>
+ fn datatype_is_semantically_equal(dt1: &DataType, dt2: &DataType) -> bool {
+ // Nested types are compared recursively; every other pair falls back to plain `==`.
+ match (dt1, dt2) {
+ (DataType::Dictionary(k1, v1), DataType::Dictionary(k2, v2)) => { // both key and value types must match
+ Self::datatype_is_semantically_equal(k1.as_ref(), k2.as_ref())
+ && Self::datatype_is_semantically_equal(v1.as_ref(), v2.as_ref())
+ }
+ (DataType::List(f1), DataType::List(f2))
+ | (DataType::LargeList(f1), DataType::LargeList(f2))
+ | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
+ | (DataType::Map(f1, _), DataType::Map(f2, _)) => { // only the child field is compared
+ Self::field_is_semantically_equal(f1, f2) // NOTE(review): the `_` component of FixedSizeList/Map (presumably list size / sorted flag) is ignored — confirm this is intended
+ }
+ (DataType::Struct(fields1), DataType::Struct(fields2))
+ | (DataType::Union(fields1, _, _), DataType::Union(fields2, _, _)) => { // Union's remaining components are not compared
+ let iter1 = fields1.iter();
+ let iter2 = fields2.iter();
+ fields1.len() == fields2.len() &&
+ // `zip` stops at the shorter side, so lengths are checked first; then every field pair must match
+ iter1
+ .zip(iter2)
+ .all(|(f1, f2)| Self::field_is_semantically_equal(f1, f2))
+ }
+ _ => dt1 == dt2, // non-nested types, or mismatched variants: exact equality
+ }
+ }
+
+ fn field_is_semantically_equal(f1: &Field, f2: &Field) -> bool { // compares two fields by name and data type only
+ f1.name() == f2.name() // names must be identical
+ && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type()) // types compared recursively; no other field property is read here
+ }
+
/// Strip all field qualifier in schema
pub fn strip_qualifiers(self) -> Self {
DFSchema {
@@ -806,6 +842,51 @@ mod tests {
let field2_i16_t = DFField::from(Field::new("f2", DataType::Int16, true));
let field3_i16_t = DFField::from(Field::new("f3", DataType::Int16, true));
+ let dict =
+ DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8));
+ let field_dict_t = DFField::from(Field::new("f_dict", dict.clone(), true));
+ let field_dict_f = DFField::from(Field::new("f_dict", dict, false));
+
+ let list_t = DFField::from(Field::new(
+ "f_list",
+ DataType::List(Box::new(field1_i16_t.field().clone())),
+ true,
+ ));
+ let list_f = DFField::from(Field::new(
+ "f_list",
+ DataType::List(Box::new(field1_i16_f.field().clone())),
+ false,
+ ));
+
+ let list_f_name = DFField::from(Field::new(
+ "f_list",
+ DataType::List(Box::new(field2_i16_t.field().clone())),
+ false,
+ ));
+
+ let struct_t = DFField::from(Field::new(
+ "f_struct",
+ DataType::Struct(vec![field1_i16_t.field().clone()]),
+ true,
+ ));
+ let struct_f = DFField::from(Field::new(
+ "f_struct",
+ DataType::Struct(vec![field1_i16_f.field().clone()]),
+ false,
+ ));
+
+ let struct_f_meta = DFField::from(Field::new(
+ "f_struct",
+ DataType::Struct(vec![field1_i16_t_meta.field().clone()]),
+ false,
+ ));
+
+ let struct_f_type = DFField::from(Field::new(
+ "f_struct",
+ DataType::Struct(vec![field1_i32_t.field().clone()]),
+ false,
+ ));
+
// same
TestCase {
fields1: vec![&field1_i16_t],
@@ -870,6 +951,70 @@ mod tests {
}
.run();
+ // dictionary
+ TestCase {
+ fields1: vec![&field_dict_t],
+ fields2: vec![&field_dict_t],
+ expected: true,
+ }
+ .run();
+
+ // dictionary (different nullable)
+ TestCase {
+ fields1: vec![&field_dict_t],
+ fields2: vec![&field_dict_f],
+ expected: true,
+ }
+ .run();
+
+ // dictionary (wrong type)
+ TestCase {
+ fields1: vec![&field_dict_t],
+ fields2: vec![&field1_i16_t],
+ expected: false,
+ }
+ .run();
+
+ // list (different embedded nullability)
+ TestCase {
+ fields1: vec![&list_t],
+ fields2: vec![&list_f],
+ expected: true,
+ }
+ .run();
+
+ // list (different sub field names)
+ TestCase {
+ fields1: vec![&list_t],
+ fields2: vec![&list_f_name],
+ expected: false,
+ }
+ .run();
+
+ // struct
+ TestCase {
+ fields1: vec![&struct_t],
+ fields2: vec![&struct_f],
+ expected: true,
+ }
+ .run();
+
+ // struct (different embedded meta)
+ TestCase {
+ fields1: vec![&struct_t],
+ fields2: vec![&struct_f_meta],
+ expected: true,
+ }
+ .run();
+
+ // struct (different field type)
+ TestCase {
+ fields1: vec![&struct_t],
+ fields2: vec![&struct_f_type],
+ expected: false,
+ }
+ .run();
+
#[derive(Debug)]
struct TestCase<'a> {
fields1: Vec<&'a DFField>,
@@ -885,7 +1030,9 @@ mod tests {
assert_eq!(
schema1.equivalent_names_and_types(&schema2),
self.expected,
- "schema1:\n\n{:#?}\n\nschema2:\n\n{:#?}",
+ "Comparison did not match expected: {}\n\n\
+ schema1:\n\n{:#?}\n\nschema2:\n\n{:#?}",
+ self.expected,
schema1,
schema2
);