You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ag...@apache.org on 2019/01/08 13:45:21 UTC
[arrow] branch master updated: ARROW-4060: [Rust] Add parquet arrow
converter.
This is an automated email from the ASF dual-hosted git repository.
agrove pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new af07f75 ARROW-4060: [Rust] Add parquet arrow converter.
af07f75 is described below
commit af07f75c1f692d1ed4cea93d358ff1acda6a1771
Author: Renjie Liu <li...@gmail.com>
AuthorDate: Tue Jan 8 06:45:13 2019 -0700
ARROW-4060: [Rust] Add parquet arrow converter.
This is the first step of adding an arrow reader and writer for parquet-rs.
This commit contains a converter which converts parquet schema to arrow schema.
Copied from this PR: https://github.com/sunchao/parquet-rs/pull/185.
Author: Renjie Liu <li...@gmail.com>
Closes #3279 from liurenjie1024/rust-arrow-schema-converter and squashes the following commits:
1bfa00f <Renjie Liu> Resolve conflict
8806b16 <Renjie Liu> Add parquet arrow converter
---
rust/parquet/src/errors.rs | 6 +
rust/parquet/src/lib.rs | 1 +
rust/parquet/src/{lib.rs => reader/mod.rs} | 28 +-
rust/parquet/src/reader/schema.rs | 779 +++++++++++++++++++++++++++++
rust/parquet/src/schema/types.rs | 14 +-
5 files changed, 805 insertions(+), 23 deletions(-)
diff --git a/rust/parquet/src/errors.rs b/rust/parquet/src/errors.rs
index a5532c1..abfbda9 100644
--- a/rust/parquet/src/errors.rs
+++ b/rust/parquet/src/errors.rs
@@ -50,6 +50,12 @@ quick_error! {
display("EOF: {}", message)
description(message)
}
+ /// Arrow error.
+ /// Returned when reading into arrow or writing from arrow.
+ ArrowError(message: String) {
+ display("Arrow: {}", message)
+ description(message)
+ }
}
}
diff --git a/rust/parquet/src/lib.rs b/rust/parquet/src/lib.rs
index 75c56f5..cad85ec 100644
--- a/rust/parquet/src/lib.rs
+++ b/rust/parquet/src/lib.rs
@@ -37,5 +37,6 @@ pub mod column;
pub mod compression;
mod encodings;
pub mod file;
+pub mod reader;
pub mod record;
pub mod schema;
diff --git a/rust/parquet/src/lib.rs b/rust/parquet/src/reader/mod.rs
similarity index 64%
copy from rust/parquet/src/lib.rs
copy to rust/parquet/src/reader/mod.rs
index 75c56f5..fe580c5 100644
--- a/rust/parquet/src/lib.rs
+++ b/rust/parquet/src/reader/mod.rs
@@ -15,27 +15,11 @@
// specific language governing permissions and limitations
// under the License.
-#![feature(type_ascription)]
-#![feature(rustc_private)]
-#![feature(specialization)]
-#![feature(try_from)]
-#![allow(dead_code)]
-#![allow(non_camel_case_types)]
+//! [Apache Arrow](http://arrow.apache.org/) is a cross-language development platform for
+//! in-memory data.
+//!
+//! This module provides an API for converting between arrow and parquet.
-#[macro_use]
-pub mod errors;
-pub mod basic;
-pub mod data_type;
-
-// Exported for external use, such as benchmarks
-pub use self::encodings::{decoding, encoding};
-pub use self::util::memory;
-
-#[macro_use]
-mod util;
-pub mod column;
-pub mod compression;
-mod encodings;
-pub mod file;
-pub mod record;
pub mod schema;
+
+pub use self::schema::{parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns};
diff --git a/rust/parquet/src/reader/schema.rs b/rust/parquet/src/reader/schema.rs
new file mode 100644
index 0000000..68fd867
--- /dev/null
+++ b/rust/parquet/src/reader/schema.rs
@@ -0,0 +1,779 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Provides API for converting parquet schema to arrow schema and vice versa.
+//!
+//! The main interfaces for converting parquet schema to arrow schema are
+//! `parquet_to_arrow_schema` and `parquet_to_arrow_schema_by_columns`.
+//!
+//! The interfaces for converting arrow schema to parquet schema are coming.
+
+use std::{collections::HashSet, rc::Rc};
+
+use crate::basic::{LogicalType, Repetition, Type as PhysicalType};
+use crate::errors::{ParquetError::ArrowError, Result};
+use crate::schema::types::{SchemaDescPtr, Type, TypePtr};
+
+use arrow::datatypes::{DataType, Field, Schema};
+
+/// Convert parquet schema to arrow schema.
+pub fn parquet_to_arrow_schema(parquet_schema: SchemaDescPtr) -> Result<Schema> {
+ parquet_to_arrow_schema_by_columns(parquet_schema.clone(), 0..parquet_schema.columns().len())
+}
+
+/// Convert parquet schema to arrow schema, only preserving some leaf columns.
+pub fn parquet_to_arrow_schema_by_columns<T>(
+ parquet_schema: SchemaDescPtr,
+ column_indices: T,
+) -> Result<Schema>
+where
+ T: IntoIterator<Item = usize>,
+{
+ let mut base_nodes = Vec::new();
+ let mut base_nodes_set = HashSet::new();
+ let mut leaves = HashSet::new();
+
+ for c in column_indices {
+ let column = parquet_schema.column(c).self_type() as *const Type;
+ let root = parquet_schema.get_column_root_ptr(c);
+ let root_raw_ptr = root.clone().as_ref() as *const Type;
+
+ leaves.insert(column);
+ if !base_nodes_set.contains(&root_raw_ptr) {
+ base_nodes.push(root);
+ base_nodes_set.insert(root_raw_ptr);
+ }
+ }
+
+ let leaves = Rc::new(leaves);
+ base_nodes
+ .into_iter()
+ .map(|t| ParquetTypeConverter::new(t, leaves.clone()).to_field())
+ .collect::<Result<Vec<Option<Field>>>>()
+ .map(|result| result.into_iter().filter_map(|f| f).collect::<Vec<Field>>())
+ .map(|fields| Schema::new(fields))
+}
+
+/// This struct is used to group methods and data structures used to convert parquet
+/// schema together.
+struct ParquetTypeConverter {
+ schema: TypePtr,
+ /// This is the columns that need to be converted to arrow schema.
+ columns_to_convert: Rc<HashSet<*const Type>>,
+}
+
+impl ParquetTypeConverter {
+ fn new(schema: TypePtr, columns_to_convert: Rc<HashSet<*const Type>>) -> Self {
+ Self {
+ schema,
+ columns_to_convert,
+ }
+ }
+
+ fn clone_with_schema(&self, other: TypePtr) -> Self {
+ Self {
+ schema: other,
+ columns_to_convert: self.columns_to_convert.clone(),
+ }
+ }
+}
+
+impl ParquetTypeConverter {
+ // Public interfaces.
+
+ /// Converts parquet schema to arrow data type.
+ ///
+ /// This function discards schema name.
+ ///
+ /// If this schema is a primitive type and not included in the leaves, the result is
+ /// Ok(None).
+ ///
+ /// If this schema is a group type and none of its children is reserved in the
+ /// conversion, the result is Ok(None).
+ fn to_data_type(&self) -> Result<Option<DataType>> {
+ match self.schema.as_ref() {
+ Type::PrimitiveType { .. } => self.to_primitive_type(),
+ Type::GroupType { .. } => self.to_group_type(),
+ }
+ }
+
+ /// Converts parquet schema to arrow field.
+ ///
+ /// This method is roughly the same as
+ /// [`to_data_type`](`ParquetTypeConverter::to_data_type`), except it reserves schema
+ /// name.
+ fn to_field(&self) -> Result<Option<Field>> {
+ self.to_data_type()
+ .map(|opt| opt.map(|dt| Field::new(self.schema.name(), dt, self.is_nullable())))
+ }
+
+ // Utility functions.
+
+ /// Checks whether this schema is nullable.
+ fn is_nullable(&self) -> bool {
+ let basic_info = self.schema.get_basic_info();
+ if basic_info.has_repetition() {
+ match basic_info.repetition() {
+ Repetition::OPTIONAL => true,
+ Repetition::REPEATED => true,
+ Repetition::REQUIRED => false,
+ }
+ } else {
+ false
+ }
+ }
+
+ fn is_repeated(&self) -> bool {
+ let basic_info = self.schema.get_basic_info();
+
+ basic_info.has_repetition() && basic_info.repetition() == Repetition::REPEATED
+ }
+
+ fn is_self_included(&self) -> bool {
+ self.columns_to_convert
+ .contains(&(self.schema.as_ref() as *const Type))
+ }
+
+ // Functions for primitive types.
+
+ /// Entry point for converting parquet primitive type to arrow type.
+ ///
+ /// This function takes care of repetition.
+ fn to_primitive_type(&self) -> Result<Option<DataType>> {
+ if self.is_self_included() {
+ self.to_primitive_type_inner().map(|dt| {
+ if self.is_repeated() {
+ Some(DataType::List(Box::new(dt)))
+ } else {
+ Some(dt)
+ }
+ })
+ } else {
+ Ok(None)
+ }
+ }
+
+ /// Converts parquet primitive type to arrow data type.
+ fn to_primitive_type_inner(&self) -> Result<DataType> {
+ match self.schema.get_physical_type() {
+ PhysicalType::BOOLEAN => Ok(DataType::Boolean),
+ PhysicalType::INT32 => self.to_int32(),
+ PhysicalType::INT64 => self.to_int64(),
+ PhysicalType::FLOAT => Ok(DataType::Float32),
+ PhysicalType::DOUBLE => Ok(DataType::Float64),
+ PhysicalType::BYTE_ARRAY => self.to_byte_array(),
+ other => Err(ArrowError(format!(
+ "Unable to convert parquet type {}",
+ other
+ ))),
+ }
+ }
+
+ fn to_int32(&self) -> Result<DataType> {
+ match self.schema.get_basic_info().logical_type() {
+ LogicalType::NONE => Ok(DataType::Int32),
+ LogicalType::UINT_8 => Ok(DataType::UInt8),
+ LogicalType::UINT_16 => Ok(DataType::UInt16),
+ LogicalType::UINT_32 => Ok(DataType::UInt32),
+ LogicalType::INT_8 => Ok(DataType::Int8),
+ LogicalType::INT_16 => Ok(DataType::Int16),
+ LogicalType::INT_32 => Ok(DataType::Int32),
+ other => Err(ArrowError(format!(
+ "Unable to convert parquet logical type {}",
+ other
+ ))),
+ }
+ }
+
+ fn to_int64(&self) -> Result<DataType> {
+ match self.schema.get_basic_info().logical_type() {
+ LogicalType::NONE => Ok(DataType::Int64),
+ LogicalType::INT_64 => Ok(DataType::Int64),
+ LogicalType::UINT_64 => Ok(DataType::UInt64),
+ other => Err(ArrowError(format!(
+ "Unable to convert parquet logical type {}",
+ other
+ ))),
+ }
+ }
+
+ fn to_byte_array(&self) -> Result<DataType> {
+ match self.schema.get_basic_info().logical_type() {
+ LogicalType::UTF8 => Ok(DataType::Utf8),
+ other => Err(ArrowError(format!(
+ "Unable to convert parquet logical type {}",
+ other
+ ))),
+ }
+ }
+
+ // Functions for group types.
+
+ /// Entry point for converting parquet group type.
+ ///
+ /// This function takes care of logical type and repetition.
+ fn to_group_type(&self) -> Result<Option<DataType>> {
+ if self.is_repeated() {
+ self.to_struct()
+ .map(|opt| opt.map(|dt| DataType::List(Box::new(dt))))
+ } else {
+ match self.schema.get_basic_info().logical_type() {
+ LogicalType::LIST => self.to_list(),
+ _ => self.to_struct(),
+ }
+ }
+ }
+
+ /// Converts a parquet group type to arrow struct.
+ fn to_struct(&self) -> Result<Option<DataType>> {
+ match self.schema.as_ref() {
+ Type::PrimitiveType { .. } => panic!(
+ "{:?} is a struct type, and can't be processed as primitive.",
+ self.schema
+ ),
+ Type::GroupType {
+ basic_info: _,
+ fields,
+ } => fields
+ .iter()
+ .map(|field_ptr| self.clone_with_schema(field_ptr.clone()).to_field())
+ .collect::<Result<Vec<Option<Field>>>>()
+ .map(|result| result.into_iter().filter_map(|f| f).collect::<Vec<Field>>())
+ .map(|fields| {
+ if fields.is_empty() {
+ None
+ } else {
+ Some(DataType::Struct(fields))
+ }
+ }),
+ }
+ }
+
+ /// Converts a parquet list to arrow list.
+ ///
+ /// To fully understand this algorithm, please refer to
+ /// [parquet doc](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md).
+ fn to_list(&self) -> Result<Option<DataType>> {
+ match self.schema.as_ref() {
+ Type::PrimitiveType { .. } => panic!(
+ "{:?} is a list type and can't be processed as primitive.",
+ self.schema
+ ),
+ Type::GroupType {
+ basic_info: _,
+ fields,
+ } if fields.len() == 1 => {
+ let list_item = fields.first().unwrap();
+ let item_converter = self.clone_with_schema(list_item.clone());
+
+ let item_type = match list_item.as_ref() {
+ Type::PrimitiveType { .. } => {
+ if item_converter.is_repeated() {
+ item_converter.to_primitive_type_inner().map(|dt| Some(dt))
+ } else {
+ Err(ArrowError(
+ "Primitive element type of list must be repeated.".to_string(),
+ ))
+ }
+ }
+ Type::GroupType {
+ basic_info: _,
+ fields,
+ } => {
+ if fields.len() > 1 {
+ item_converter.to_struct()
+ } else if fields.len() == 1
+ && list_item.name() != "array"
+ && list_item.name() != format!("{}_tuple", self.schema.name())
+ {
+ let nested_item = fields.first().unwrap();
+ let nested_item_converter = self.clone_with_schema(nested_item.clone());
+
+ nested_item_converter.to_data_type()
+ } else {
+ item_converter.to_struct()
+ }
+ }
+ };
+
+ item_type.map(|opt| opt.map(|dt| DataType::List(Box::new(dt))))
+ }
+ _ => Err(ArrowError(
+ "Group element type of list can only contain one field.".to_string(),
+ )),
+ }
+ }
+}
+
+#[cfg(test)]
+mod tests {
+ use std::rc::Rc;
+
+ use crate::schema::{parser::parse_message_type, types::SchemaDescriptor};
+
+ use arrow::datatypes::{DataType, Field};
+
+ use super::{parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns};
+
+ #[test]
+ fn test_flat_primitives() {
+ let message_type = "
+ message test_schema {
+ REQUIRED BOOLEAN boolean;
+ REQUIRED INT32 int8 (INT_8);
+ REQUIRED INT32 int16 (INT_16);
+ REQUIRED INT32 int32;
+ REQUIRED INT64 int64 ;
+ OPTIONAL DOUBLE double;
+ OPTIONAL FLOAT float;
+ OPTIONAL BINARY string (UTF8);
+ }
+ ";
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ let parquet_schema = SchemaDescriptor::new(Rc::new(parquet_group_type));
+ let converted_arrow_schema = parquet_to_arrow_schema(Rc::new(parquet_schema)).unwrap();
+
+ let arrow_fields = vec![
+ Field::new("boolean", DataType::Boolean, false),
+ Field::new("int8", DataType::Int8, false),
+ Field::new("int16", DataType::Int16, false),
+ Field::new("int32", DataType::Int32, false),
+ Field::new("int64", DataType::Int64, false),
+ Field::new("double", DataType::Float64, true),
+ Field::new("float", DataType::Float32, true),
+ Field::new("string", DataType::Utf8, true),
+ ];
+
+ assert_eq!(&arrow_fields, converted_arrow_schema.fields());
+ }
+
+ #[test]
+ fn test_duplicate_fields() {
+ let message_type = "
+ message test_schema {
+ REQUIRED BOOLEAN boolean;
+ REQUIRED INT32 int8 (INT_8);
+ }
+ ";
+
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+ let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+
+ let arrow_fields = vec![
+ Field::new("boolean", DataType::Boolean, false),
+ Field::new("int8", DataType::Int8, false),
+ ];
+ assert_eq!(&arrow_fields, converted_arrow_schema.fields());
+
+ let converted_arrow_schema =
+ parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0usize, 1usize])
+ .unwrap();
+ assert_eq!(&arrow_fields, converted_arrow_schema.fields());
+ }
+
+ #[test]
+ fn test_parquet_lists() {
+ let mut arrow_fields = Vec::new();
+
+ // LIST encoding example taken from parquet-format/LogicalTypes.md
+ let message_type = "
+ message test_schema {
+ REQUIRED GROUP my_list (LIST) {
+ REPEATED GROUP list {
+ OPTIONAL BINARY element (UTF8);
+ }
+ }
+ OPTIONAL GROUP my_list (LIST) {
+ REPEATED GROUP list {
+ REQUIRED BINARY element (UTF8);
+ }
+ }
+ OPTIONAL GROUP array_of_arrays (LIST) {
+ REPEATED GROUP list {
+ REQUIRED GROUP element (LIST) {
+ REPEATED GROUP list {
+ REQUIRED INT32 element;
+ }
+ }
+ }
+ }
+ OPTIONAL GROUP my_list (LIST) {
+ REPEATED GROUP element {
+ REQUIRED BINARY str (UTF8);
+ }
+ }
+ OPTIONAL GROUP my_list (LIST) {
+ REPEATED INT32 element;
+ }
+ OPTIONAL GROUP my_list (LIST) {
+ REPEATED GROUP element {
+ REQUIRED BINARY str (UTF8);
+ REQUIRED INT32 num;
+ }
+ }
+ OPTIONAL GROUP my_list (LIST) {
+ REPEATED GROUP array {
+ REQUIRED BINARY str (UTF8);
+ }
+
+ }
+ OPTIONAL GROUP my_list (LIST) {
+ REPEATED GROUP my_list_tuple {
+ REQUIRED BINARY str (UTF8);
+ }
+ }
+ REPEATED INT32 name;
+ }
+ ";
+
+ // // List<String> (list non-null, elements nullable)
+ // required group my_list (LIST) {
+ // repeated group list {
+ // optional binary element (UTF8);
+ // }
+ // }
+ {
+ arrow_fields.push(Field::new(
+ "my_list",
+ DataType::List(Box::new(DataType::Utf8)),
+ false,
+ ));
+ }
+
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group list {
+ // required binary element (UTF8);
+ // }
+ // }
+ {
+ arrow_fields.push(Field::new(
+ "my_list",
+ DataType::List(Box::new(DataType::Utf8)),
+ true,
+ ));
+ }
+
+ // Element types can be nested structures. For example, a list of lists:
+ //
+ // // List<List<Integer>>
+ // optional group array_of_arrays (LIST) {
+ // repeated group list {
+ // required group element (LIST) {
+ // repeated group list {
+ // required int32 element;
+ // }
+ // }
+ // }
+ // }
+ {
+ let arrow_inner_list = DataType::List(Box::new(DataType::Int32));
+ arrow_fields.push(Field::new(
+ "array_of_arrays",
+ DataType::List(Box::new(arrow_inner_list)),
+ true,
+ ));
+ }
+
+ // // List<String> (list nullable, elements non-null)
+ // optional group my_list (LIST) {
+ // repeated group element {
+ // required binary str (UTF8);
+ // };
+ // }
+ {
+ arrow_fields.push(Field::new(
+ "my_list",
+ DataType::List(Box::new(DataType::Utf8)),
+ true,
+ ));
+ }
+
+ // // List<Integer> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated int32 element;
+ // }
+ {
+ arrow_fields.push(Field::new(
+ "my_list",
+ DataType::List(Box::new(DataType::Int32)),
+ true,
+ ));
+ }
+
+ // // List<Tuple<String, Integer>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group element {
+ // required binary str (UTF8);
+ // required int32 num;
+ // };
+ // }
+ {
+ let arrow_struct = DataType::Struct(vec![
+ Field::new("str", DataType::Utf8, false),
+ Field::new("num", DataType::Int32, false),
+ ]);
+ arrow_fields.push(Field::new(
+ "my_list",
+ DataType::List(Box::new(arrow_struct)),
+ true,
+ ));
+ }
+
+ // // List<OneTuple<String>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group array {
+ // required binary str (UTF8);
+ // };
+ // }
+ // Special case: group is named array
+ {
+ let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
+ arrow_fields.push(Field::new(
+ "my_list",
+ DataType::List(Box::new(arrow_struct)),
+ true,
+ ));
+ }
+
+ // // List<OneTuple<String>> (nullable list, non-null elements)
+ // optional group my_list (LIST) {
+ // repeated group my_list_tuple {
+ // required binary str (UTF8);
+ // };
+ // }
+ // Special case: group name ends in _tuple
+ {
+ let arrow_struct = DataType::Struct(vec![Field::new("str", DataType::Utf8, false)]);
+ arrow_fields.push(Field::new(
+ "my_list",
+ DataType::List(Box::new(arrow_struct)),
+ true,
+ ));
+ }
+
+ // One-level encoding: Only allows required lists with required cells
+ // repeated value_type name
+ {
+ arrow_fields.push(Field::new(
+ "name",
+ DataType::List(Box::new(DataType::Int32)),
+ true,
+ ));
+ }
+
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+ let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+ let converted_fields = converted_arrow_schema.fields();
+
+ assert_eq!(arrow_fields.len(), converted_fields.len());
+ for i in 0..arrow_fields.len() {
+ assert_eq!(arrow_fields[i], converted_fields[i]);
+ }
+ }
+
+ #[test]
+ fn test_nested_schema() {
+ let mut arrow_fields = Vec::new();
+ {
+ let group1_fields = vec![
+ Field::new("leaf1", DataType::Boolean, false),
+ Field::new("leaf2", DataType::Int32, false),
+ ];
+ let group1_struct = Field::new("group1", DataType::Struct(group1_fields), false);
+ arrow_fields.push(group1_struct);
+
+ let leaf3_field = Field::new("leaf3", DataType::Int64, false);
+ arrow_fields.push(leaf3_field);
+ }
+
+ let message_type = "
+ message test_schema {
+ REQUIRED GROUP group1 {
+ REQUIRED BOOLEAN leaf1;
+ REQUIRED INT32 leaf2;
+ }
+ REQUIRED INT64 leaf3;
+ }
+ ";
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+ let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+ let converted_fields = converted_arrow_schema.fields();
+
+ assert_eq!(arrow_fields.len(), converted_fields.len());
+ for i in 0..arrow_fields.len() {
+ assert_eq!(arrow_fields[i], converted_fields[i]);
+ }
+ }
+
+ #[test]
+ fn test_nested_schema_partial() {
+ let mut arrow_fields = Vec::new();
+ {
+ let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)];
+ let group1 = Field::new("group1", DataType::Struct(group1_fields), false);
+ arrow_fields.push(group1);
+
+ let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)];
+ let group2 = Field::new("group2", DataType::Struct(group2_fields), false);
+ arrow_fields.push(group2);
+
+ arrow_fields.push(Field::new("leaf5", DataType::Int64, false));
+ }
+
+ let message_type = "
+ message test_schema {
+ REQUIRED GROUP group1 {
+ REQUIRED INT64 leaf1;
+ REQUIRED INT64 leaf2;
+ }
+ REQUIRED GROUP group2 {
+ REQUIRED INT64 leaf3;
+ REQUIRED INT64 leaf4;
+ }
+ REQUIRED INT64 leaf5;
+ }
+ ";
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ // Expected partial arrow schema (columns 0, 3, 4):
+ // required group group1 {
+ // required int64 leaf1;
+ // }
+ // required group group2 {
+ // required int64 leaf4;
+ // }
+ // required int64 leaf5;
+
+ let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+ let converted_arrow_schema =
+ parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![0, 3, 4]).unwrap();
+ let converted_fields = converted_arrow_schema.fields();
+
+ assert_eq!(arrow_fields.len(), converted_fields.len());
+ for i in 0..arrow_fields.len() {
+ assert_eq!(arrow_fields[i], converted_fields[i]);
+ }
+ }
+
+ #[test]
+ fn test_nested_schema_partial_ordering() {
+ let mut arrow_fields = Vec::new();
+ {
+ let group2_fields = vec![Field::new("leaf4", DataType::Int64, false)];
+ let group2 = Field::new("group2", DataType::Struct(group2_fields), false);
+ arrow_fields.push(group2);
+
+ arrow_fields.push(Field::new("leaf5", DataType::Int64, false));
+
+ let group1_fields = vec![Field::new("leaf1", DataType::Int64, false)];
+ let group1 = Field::new("group1", DataType::Struct(group1_fields), false);
+ arrow_fields.push(group1);
+ }
+
+ let message_type = "
+ message test_schema {
+ REQUIRED GROUP group1 {
+ REQUIRED INT64 leaf1;
+ REQUIRED INT64 leaf2;
+ }
+ REQUIRED GROUP group2 {
+ REQUIRED INT64 leaf3;
+ REQUIRED INT64 leaf4;
+ }
+ REQUIRED INT64 leaf5;
+ }
+ ";
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ // Expected partial arrow schema (columns 3, 4, 0):
+ // required group group1 {
+ // required int64 leaf1;
+ // }
+ // required group group2 {
+ // required int64 leaf4;
+ // }
+ // required int64 leaf5;
+
+ let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+ let converted_arrow_schema =
+ parquet_to_arrow_schema_by_columns(parquet_schema.clone(), vec![3, 4, 0]).unwrap();
+ let converted_fields = converted_arrow_schema.fields();
+
+ assert_eq!(arrow_fields.len(), converted_fields.len());
+ for i in 0..arrow_fields.len() {
+ assert_eq!(arrow_fields[i], converted_fields[i]);
+ }
+ }
+
+ #[test]
+ fn test_repeated_nested_schema() {
+ let mut arrow_fields = Vec::new();
+ {
+ arrow_fields.push(Field::new("leaf1", DataType::Int32, true));
+
+ let inner_group_list = Field::new(
+ "innerGroup",
+ DataType::List(Box::new(DataType::Struct(vec![Field::new(
+ "leaf3",
+ DataType::Int32,
+ true,
+ )]))),
+ true,
+ );
+
+ let outer_group_list = Field::new(
+ "outerGroup",
+ DataType::List(Box::new(DataType::Struct(vec![
+ Field::new("leaf2", DataType::Int32, true),
+ inner_group_list,
+ ]))),
+ true,
+ );
+ arrow_fields.push(outer_group_list);
+ }
+
+ let message_type = "
+ message test_schema {
+ OPTIONAL INT32 leaf1;
+ REPEATED GROUP outerGroup {
+ OPTIONAL INT32 leaf2;
+ REPEATED GROUP innerGroup {
+ OPTIONAL INT32 leaf3;
+ }
+ }
+ }
+ ";
+ let parquet_group_type = parse_message_type(message_type).unwrap();
+
+ let parquet_schema = Rc::new(SchemaDescriptor::new(Rc::new(parquet_group_type)));
+ let converted_arrow_schema = parquet_to_arrow_schema(parquet_schema.clone()).unwrap();
+ let converted_fields = converted_arrow_schema.fields();
+
+ assert_eq!(arrow_fields.len(), converted_fields.len());
+ for i in 0..arrow_fields.len() {
+ assert_eq!(arrow_fields[i], converted_fields[i]);
+ }
+ }
+}
diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs
index 30ee9f6..aa314d6 100644
--- a/rust/parquet/src/schema/types.rs
+++ b/rust/parquet/src/schema/types.rs
@@ -741,19 +741,31 @@ impl SchemaDescriptor {
/// Returns column root [`Type`](`::schema::types::Type`) for a field position.
pub fn get_column_root(&self, i: usize) -> &Type {
+ let result = self.column_root_of(i);
+ result.as_ref()
+ }
+
+ /// Returns column root [`Type`](`::schema::types::Type`) pointer for a field position.
+ pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
+ let result = self.column_root_of(i);
+ result.clone()
+ }
+
+ fn column_root_of(&self, i: usize) -> &Rc<Type> {
assert!(
i < self.leaves.len(),
"Index out of bound: {} not in [0, {})",
i,
self.leaves.len()
);
+
let result = self.leaf_to_base.get(&i);
assert!(
result.is_some(),
"Expected a value for index {} but found None",
i
);
- result.unwrap().as_ref()
+ result.unwrap()
}
/// Returns schema as [`Type`](`::schema::types::Type`).