You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/01/17 11:03:39 UTC
[arrow] branch master updated: ARROW-11271: [Rust] [Parquet] Fix
parquet list schema null conversion
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new e73f205 ARROW-11271: [Rust] [Parquet] Fix parquet list schema null conversion
e73f205 is described below
commit e73f205465051cb19cbec6900a01db8837948e9f
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Sun Jan 17 06:02:19 2021 -0500
ARROW-11271: [Rust] [Parquet] Fix parquet list schema null conversion
Please see the JIRA for the issue details https://issues.apache.org/jira/browse/ARROW-11271.
Closes #9216 from nevi-me/ARROW-11271
Authored-by: Neville Dipale <ne...@gmail.com>
Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
rust/parquet/src/arrow/schema.rs | 118 +++++++++++++++++++++++++++++++++++----
1 file changed, 107 insertions(+), 11 deletions(-)
diff --git a/rust/parquet/src/arrow/schema.rs b/rust/parquet/src/arrow/schema.rs
index 0c9da57..b9afcb6 100644
--- a/rust/parquet/src/arrow/schema.rs
+++ b/rust/parquet/src/arrow/schema.rs
@@ -29,10 +29,13 @@ use std::sync::Arc;
use arrow::datatypes::{DataType, DateUnit, Field, IntervalUnit, Schema, TimeUnit};
use arrow::ipc::writer;
-use crate::basic::{LogicalType, Repetition, Type as PhysicalType};
use crate::errors::{ParquetError::ArrowError, Result};
use crate::file::{metadata::KeyValue, properties::WriterProperties};
use crate::schema::types::{ColumnDescriptor, SchemaDescriptor, Type, TypePtr};
+use crate::{
+ basic::{LogicalType, Repetition, Type as PhysicalType},
+ errors::ParquetError,
+};
/// Convert Parquet schema to Arrow schema including optional metadata.
/// Attempts to decode any existing Arrow schema metadata, falling back
@@ -684,10 +687,10 @@ impl ParquetTypeConverter<'_> {
/// Converts a parquet group type to arrow struct.
fn to_struct(&self) -> Result<Option<DataType>> {
match self.schema {
- Type::PrimitiveType { .. } => panic!(
+ Type::PrimitiveType { .. } => Err(ParquetError::General(format!(
"{:?} is a struct type, and can't be processed as primitive.",
self.schema
- ),
+ ))),
Type::GroupType {
basic_info: _,
fields,
@@ -714,10 +717,10 @@ impl ParquetTypeConverter<'_> {
/// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md).
fn to_list(&self) -> Result<Option<DataType>> {
match self.schema {
- Type::PrimitiveType { .. } => panic!(
+ Type::PrimitiveType { .. } => Err(ParquetError::General(format!(
"{:?} is a list type and can't be processed as primitive.",
self.schema
- ),
+ ))),
Type::GroupType {
basic_info: _,
fields,
@@ -757,12 +760,26 @@ impl ParquetTypeConverter<'_> {
}
};
+ // Check that the name of the list child is "list", in which case we
+ // get the child nullability and name (normally "element") from the nested
+ // group type.
+ // Without this step, the child incorrectly inherits the parent's optionality
+ let (list_item_name, item_is_optional) = match &item_converter.schema {
+ Type::GroupType { basic_info, fields }
+ if basic_info.name() == "list" && fields.len() == 1 =>
+ {
+ let field = fields.first().unwrap();
+ (field.name(), field.is_optional())
+ }
+ _ => (list_item.name(), list_item.is_optional()),
+ };
+
item_type.map(|opt| {
opt.map(|dt| {
DataType::List(Box::new(Field::new(
- list_item.name(),
+ list_item_name,
dt,
- list_item.is_optional(),
+ item_is_optional,
)))
})
})
@@ -939,7 +956,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(Field::new("list", DataType::Utf8, true))),
+ DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
false,
));
}
@@ -953,7 +970,7 @@ mod tests {
{
arrow_fields.push(Field::new(
"my_list",
- DataType::List(Box::new(Field::new("list", DataType::Utf8, true))),
+ DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
true,
));
}
@@ -972,10 +989,10 @@ mod tests {
// }
{
let arrow_inner_list =
- DataType::List(Box::new(Field::new("list", DataType::Int32, true)));
+ DataType::List(Box::new(Field::new("element", DataType::Int32, false)));
arrow_fields.push(Field::new(
"array_of_arrays",
- DataType::List(Box::new(Field::new("list", arrow_inner_list, true))),
+ DataType::List(Box::new(Field::new("element", arrow_inner_list, false))),
true,
));
}
@@ -1083,6 +1100,85 @@ mod tests {
}
#[test]
+ fn test_parquet_list_nullable() {
+ let mut arrow_fields = Vec::new();
+
+ let message_type = "
+ message test_schema {
+ REQUIRED GROUP my_list1 (LIST) {
+ REPEATED GROUP list {
+ OPTIONAL BINARY element (UTF8);
+ }
+ }
+ OPTIONAL GROUP my_list2 (LIST) {
+ REPEATED GROUP list {
+ REQUIRED BINARY element (UTF8);
+ }
+ }
+ REQUIRED GROUP my_list3 (LIST) {
+ REPEATED GROUP list {
+ REQUIRED BINARY element (UTF8);
+ }
+ }
+ }
+ ";
+
+ // // List<String> (list non-null, elements nullable)
+ // required group my_list1 (LIST) {
+ // repeated group list {
+ // optional binary element (UTF8);
+ // }
+ // }
+ {
+ arrow_fields.push(Field::new(
+ "my_list1",
+ DataType::List(Box::new(Field::new("element", DataType::Utf8, true))),
+ false,
+ ));
+ }
+
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list2 (LIST) {
+ // repeated group list {
+ // required binary element (UTF8);
+ // }
+ // }
+ {
+ arrow_fields.push(Field::new(
+ "my_list2",
+ DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
+ true,
+ ));
+ }
+
+ // // List<String> (list non-null, elements non-null)
+ // repeated group my_list3 (LIST) {
+ // repeated group list {
+ // required binary element (UTF8);
+ // }
+ // }
+ {
+ arrow_fields.push(Field::new(
+ "my_list3",
+ DataType::List(Box::new(Field::new("element", DataType::Utf8, false))),
+ false,
+ ));
+ }
+
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_group_type));
+ let converted_arrow_schema =
+ parquet_to_arrow_schema(&parquet_schema, &None).unwrap();
+ let converted_fields = converted_arrow_schema.fields();
+
+ assert_eq!(arrow_fields.len(), converted_fields.len());
+ for i in 0..arrow_fields.len() {
+ assert_eq!(arrow_fields[i], converted_fields[i]);
+ }
+ }
+
+ #[test]
fn test_nested_schema() {
let mut arrow_fields = Vec::new();
{