You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by su...@apache.org on 2019/04/08 03:44:55 UTC
[arrow] branch master updated: ARROW-5126: [Rust] [Parquet] Convert
parquet column desc to arrow data type
This is an automated email from the ASF dual-hosted git repository.
sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6907d97 ARROW-5126: [Rust] [Parquet] Convert parquet column desc to arrow data type
6907d97 is described below
commit 6907d972c91de654a1ef4eb2ed59e75a89fb6d00
Author: Renjie Liu <li...@gmail.com>
AuthorDate: Sun Apr 7 20:44:42 2019 -0700
ARROW-5126: [Rust] [Parquet] Convert parquet column desc to arrow data type
Add a converter function to convert parquet column schema to arrow field.
Author: Renjie Liu <li...@gmail.com>
Closes #4117 from liurenjie1024/arrow-5126 and squashes the following commits:
8ffa92b4 <Renjie Liu> Add tests for list and temporal types
411db52d <Renjie Liu> Fix doc
5803da68 <Renjie Liu> Add column schema converter
---
rust/parquet/src/reader/schema.rs | 77 +++++++++++++++++++++++++++++++++++++--
rust/parquet/src/schema/types.rs | 5 +++
2 files changed, 78 insertions(+), 4 deletions(-)
diff --git a/rust/parquet/src/reader/schema.rs b/rust/parquet/src/reader/schema.rs
index 3e66c03..06b04bc 100644
--- a/rust/parquet/src/reader/schema.rs
+++ b/rust/parquet/src/reader/schema.rs
@@ -18,7 +18,8 @@
//! Provides API for converting parquet schema to arrow schema and vice versa.
//!
//! The main interfaces for converting parquet schema to arrow schema are
-//! `parquet_to_arrow_schema` and `parquet_to_arrow_schema_by_columns`.
+//! `parquet_to_arrow_schema`, `parquet_to_arrow_schema_by_columns` and
+//! `parquet_to_arrow_field`.
//!
//! The interface for converting arrow schema to parquet schema is coming.
@@ -26,7 +27,7 @@ use std::{collections::HashSet, rc::Rc};
use crate::basic::{LogicalType, Repetition, Type as PhysicalType};
use crate::errors::{ParquetError::ArrowError, Result};
-use crate::schema::types::{SchemaDescPtr, Type, TypePtr};
+use crate::schema::types::{ColumnDescPtr, SchemaDescPtr, Type, TypePtr};
use arrow::datatypes::TimeUnit;
use arrow::datatypes::{DataType, DateUnit, Field, Schema};
@@ -72,6 +73,18 @@ where
.map(|fields| Schema::new(fields))
}
+/// Convert a parquet leaf column descriptor to an arrow field.
+///
+/// The column's own type is handed to `ParquetTypeConverter` as the schema to
+/// convert, and is also registered as the only entry of the leaf set.
+///
+/// # Errors
+///
+/// Propagates any error reported by the converter. If the converter produces no
+/// field for the column, an `ArrowError` is returned instead of panicking.
+pub fn parquet_to_arrow_field(parquet_column: ColumnDescPtr) -> Result<Field> {
+    let schema = parquet_column.self_type_ptr();
+
+    // The leaf set stores raw `*const Type` pointers, so the column's own type is
+    // registered by address.
+    let mut leaves = HashSet::new();
+    leaves.insert(parquet_column.self_type() as *const Type);
+
+    ParquetTypeConverter::new(schema, Rc::new(leaves))
+        .to_field()
+        .and_then(|opt| {
+            // `to_field` may report "no field"; surface that as an error to the caller
+            // rather than unwrapping in library code.
+            opt.ok_or_else(|| {
+                ArrowError("Unable to convert parquet column to arrow field".to_string())
+            })
+        })
+}
+
/// This struct is used to group methods and data structures used to convert parquet
/// schema together.
struct ParquetTypeConverter {
@@ -345,9 +358,12 @@ mod tests {
use crate::schema::{parser::parse_message_type, types::SchemaDescriptor};
- use arrow::datatypes::{DataType, Field};
+ use arrow::datatypes::{DataType, DateUnit, Field, TimeUnit};
- use super::{parquet_to_arrow_schema, parquet_to_arrow_schema_by_columns};
+ use super::{
+ parquet_to_arrow_field, parquet_to_arrow_schema,
+ parquet_to_arrow_schema_by_columns,
+ };
#[test]
fn test_flat_primitives() {
@@ -808,4 +824,57 @@ mod tests {
assert_eq!(arrow_fields[i], converted_fields[i]);
}
}
+
+    /// Converts every column of a flat schema one at a time through
+    /// `parquet_to_arrow_field` and compares the result with the expected fields.
+    #[test]
+    fn test_column_desc_to_field() {
+        let message_type = "
+ message test_schema {
+ REQUIRED BOOLEAN boolean;
+ REQUIRED INT32 int8 (INT_8);
+ REQUIRED INT32 int16 (INT_16);
+ REQUIRED INT32 int32;
+ REQUIRED INT64 int64 ;
+ OPTIONAL DOUBLE double;
+ OPTIONAL FLOAT float;
+ OPTIONAL BINARY string (UTF8);
+ REPEATED BOOLEAN bools;
+ OPTIONAL INT32 date (DATE);
+ OPTIONAL INT32 time_milli (TIME_MILLIS);
+ OPTIONAL INT64 time_micro (TIME_MICROS);
+ OPTIONAL INT64 ts_milli (TIMESTAMP_MILLIS);
+ REQUIRED INT64 ts_micro (TIMESTAMP_MICROS);
+ }
+ ";
+
+        // Expected arrow fields, in the same order as the columns declared above.
+        let arrow_fields = vec![
+            Field::new("boolean", DataType::Boolean, false),
+            Field::new("int8", DataType::Int8, false),
+            Field::new("int16", DataType::Int16, false),
+            Field::new("int32", DataType::Int32, false),
+            Field::new("int64", DataType::Int64, false),
+            Field::new("double", DataType::Float64, true),
+            Field::new("float", DataType::Float32, true),
+            Field::new("string", DataType::Utf8, true),
+            Field::new("bools", DataType::List(Box::new(DataType::Boolean)), true),
+            Field::new("date", DataType::Date32(DateUnit::Day), true),
+            Field::new("time_milli", DataType::Time32(TimeUnit::Millisecond), true),
+            Field::new("time_micro", DataType::Time64(TimeUnit::Microsecond), true),
+            Field::new("ts_milli", DataType::Timestamp(TimeUnit::Millisecond), true),
+            Field::new(
+                "ts_micro",
+                DataType::Timestamp(TimeUnit::Microsecond),
+                false,
+            ),
+        ];
+
+        // Parse the message and wrap it in a schema descriptor to obtain the columns.
+        let root = Rc::new(parse_message_type(message_type).unwrap());
+        let descriptor = SchemaDescriptor::new(root);
+
+        // Convert each column descriptor on its own.
+        let mut converted_arrow_fields: Vec<Field> = Vec::new();
+        for column in descriptor.columns() {
+            converted_arrow_fields.push(parquet_to_arrow_field(column.clone()).unwrap());
+        }
+
+        assert_eq!(arrow_fields, converted_arrow_fields);
+    }
}
diff --git a/rust/parquet/src/schema/types.rs b/rust/parquet/src/schema/types.rs
index adf6f4e..6dc9304 100644
--- a/rust/parquet/src/schema/types.rs
+++ b/rust/parquet/src/schema/types.rs
@@ -624,6 +624,11 @@ impl ColumnDescriptor {
self.primitive_type.as_ref()
}
+    /// Returns an owned [`TypePtr`](`::schema::types::TypePtr`) to this leaf column's
+    /// own type, obtained by cloning the pointer stored in the descriptor.
+    pub fn self_type_ptr(&self) -> TypePtr {
+        TypePtr::clone(&self.primitive_type)
+    }
+
/// Returns root [`Type`](`::schema::types::Type`) (most top-level parent field) for
/// this leaf column.
pub fn root_type(&self) -> &Type {