You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/27 22:35:47 UTC
[GitHub] [arrow] nevi-me commented on a change in pull request #8788: ARROW-4544: [Rust] JSON nested struct reader
nevi-me commented on a change in pull request #8788:
URL: https://github.com/apache/arrow/pull/8788#discussion_r531799717
##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -1022,6 +846,228 @@ impl<R: Read> Reader<R> {
Ok(Arc::new(builder.finish()))
}
+ fn build_struct_array(
+ &self,
+ rows: &[Value],
+ struct_fields: &[Field],
+ projection: &[String],
+ ) -> Result<Vec<ArrayRef>> {
+ let arrays: Result<Vec<ArrayRef>> =
+ struct_fields
+ .iter()
+ .filter(|field| {
+ if projection.is_empty() {
+ return true;
+ }
+ projection.contains(field.name())
+ })
+ .map(|field| {
+ match field.data_type() {
+ DataType::Null => unimplemented!(),
+ DataType::Boolean => self.build_boolean_array(rows, field.name()),
+ DataType::Float64 => {
+ self.build_primitive_array::<Float64Type>(rows, field.name())
+ }
+ DataType::Float32 => {
+ self.build_primitive_array::<Float32Type>(rows, field.name())
+ }
+ DataType::Int64 => {
+ self.build_primitive_array::<Int64Type>(rows, field.name())
+ }
+ DataType::Int32 => {
+ self.build_primitive_array::<Int32Type>(rows, field.name())
+ }
+ DataType::Int16 => {
+ self.build_primitive_array::<Int16Type>(rows, field.name())
+ }
+ DataType::Int8 => {
+ self.build_primitive_array::<Int8Type>(rows, field.name())
+ }
+ DataType::UInt64 => {
+ self.build_primitive_array::<UInt64Type>(rows, field.name())
+ }
+ DataType::UInt32 => {
+ self.build_primitive_array::<UInt32Type>(rows, field.name())
+ }
+ DataType::UInt16 => {
+ self.build_primitive_array::<UInt16Type>(rows, field.name())
+ }
+ DataType::UInt8 => {
+ self.build_primitive_array::<UInt8Type>(rows, field.name())
+ }
+ DataType::Timestamp(unit, _) => match unit {
+ TimeUnit::Second => self
+ .build_primitive_array::<TimestampSecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Microsecond => self
+ .build_primitive_array::<TimestampMicrosecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Millisecond => self
+ .build_primitive_array::<TimestampMillisecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Nanosecond => self
+ .build_primitive_array::<TimestampNanosecondType>(
+ rows,
+ field.name(),
+ ),
+ },
+ DataType::Date64(_) => {
+ self.build_primitive_array::<Date64Type>(rows, field.name())
+ }
+ DataType::Date32(_) => {
+ self.build_primitive_array::<Date32Type>(rows, field.name())
+ }
+ DataType::Time64(unit) => match unit {
+ TimeUnit::Microsecond => self
+ .build_primitive_array::<Time64MicrosecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Nanosecond => self
+ .build_primitive_array::<Time64NanosecondType>(
+ rows,
+ field.name(),
+ ),
+ _ => unimplemented!(),
+ },
+ DataType::Time32(unit) => match unit {
+ TimeUnit::Second => self
+ .build_primitive_array::<Time32SecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Millisecond => self
+ .build_primitive_array::<Time32MillisecondType>(
+ rows,
+ field.name(),
+ ),
+ _ => unimplemented!(),
+ },
+ DataType::Utf8 => {
+ let mut builder = StringBuilder::new(rows.len());
+ for row in rows {
+ if let Some(value) = row.get(field.name()) {
+ if let Some(str_v) = value.as_str() {
+ builder.append_value(str_v)?
+ } else {
+ builder.append(false)?
+ }
+ } else {
+ builder.append(false)?
+ }
+ }
+ Ok(Arc::new(builder.finish()) as ArrayRef)
+ }
+ DataType::List(ref t) => {
+ match t.data_type() {
+ DataType::Int8 => {
+ self.build_list_array::<Int8Type>(rows, field.name())
+ }
+ DataType::Int16 => {
+ self.build_list_array::<Int16Type>(rows, field.name())
+ }
+ DataType::Int32 => {
+ self.build_list_array::<Int32Type>(rows, field.name())
+ }
+ DataType::Int64 => {
+ self.build_list_array::<Int64Type>(rows, field.name())
+ }
+ DataType::UInt8 => {
+ self.build_list_array::<UInt8Type>(rows, field.name())
+ }
+ DataType::UInt16 => self
+ .build_list_array::<UInt16Type>(rows, field.name()),
+ DataType::UInt32 => self
+ .build_list_array::<UInt32Type>(rows, field.name()),
+ DataType::UInt64 => self
+ .build_list_array::<UInt64Type>(rows, field.name()),
+ DataType::Float32 => self
+ .build_list_array::<Float32Type>(rows, field.name()),
+ DataType::Float64 => self
+ .build_list_array::<Float64Type>(rows, field.name()),
+ DataType::Null => unimplemented!(),
+ DataType::Boolean => {
+ self.build_boolean_list_array(rows, field.name())
+ }
+ ref dtype @ DataType::Utf8 => {
+ // UInt64Type passed down below is a fake type for dictionary builder.
+ // It is there to make compiler happy.
+ self.list_array_string_array_builder::<UInt64Type>(
+ &dtype,
+ field.name(),
+ rows,
+ )
+ }
+ DataType::Dictionary(ref key_ty, _) => self
+ .build_wrapped_list_array(rows, field.name(), key_ty),
+ ref e => Err(ArrowError::JsonError(format!(
+ "Data type is currently not supported in a list : {:?}",
+ e
+ ))),
+ }
+ }
+ DataType::Dictionary(ref key_ty, ref val_ty) => self
+ .build_string_dictionary_array(
+ rows,
+ field.name(),
+ key_ty,
+ val_ty,
+ ),
+ DataType::Struct(fields) => {
Review comment:
This is the main change, we recurse into the `build_struct_array` function
##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -1022,6 +846,228 @@ impl<R: Read> Reader<R> {
Ok(Arc::new(builder.finish()))
}
+ fn build_struct_array(
+ &self,
+ rows: &[Value],
+ struct_fields: &[Field],
+ projection: &[String],
+ ) -> Result<Vec<ArrayRef>> {
+ let arrays: Result<Vec<ArrayRef>> =
+ struct_fields
+ .iter()
+ .filter(|field| {
+ if projection.is_empty() {
+ return true;
+ }
+ projection.contains(field.name())
+ })
+ .map(|field| {
+ match field.data_type() {
+ DataType::Null => unimplemented!(),
+ DataType::Boolean => self.build_boolean_array(rows, field.name()),
+ DataType::Float64 => {
+ self.build_primitive_array::<Float64Type>(rows, field.name())
+ }
+ DataType::Float32 => {
+ self.build_primitive_array::<Float32Type>(rows, field.name())
+ }
+ DataType::Int64 => {
+ self.build_primitive_array::<Int64Type>(rows, field.name())
+ }
+ DataType::Int32 => {
+ self.build_primitive_array::<Int32Type>(rows, field.name())
+ }
+ DataType::Int16 => {
+ self.build_primitive_array::<Int16Type>(rows, field.name())
+ }
+ DataType::Int8 => {
+ self.build_primitive_array::<Int8Type>(rows, field.name())
+ }
+ DataType::UInt64 => {
+ self.build_primitive_array::<UInt64Type>(rows, field.name())
+ }
+ DataType::UInt32 => {
+ self.build_primitive_array::<UInt32Type>(rows, field.name())
+ }
+ DataType::UInt16 => {
+ self.build_primitive_array::<UInt16Type>(rows, field.name())
+ }
+ DataType::UInt8 => {
+ self.build_primitive_array::<UInt8Type>(rows, field.name())
+ }
+ DataType::Timestamp(unit, _) => match unit {
+ TimeUnit::Second => self
+ .build_primitive_array::<TimestampSecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Microsecond => self
+ .build_primitive_array::<TimestampMicrosecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Millisecond => self
+ .build_primitive_array::<TimestampMillisecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Nanosecond => self
+ .build_primitive_array::<TimestampNanosecondType>(
+ rows,
+ field.name(),
+ ),
+ },
+ DataType::Date64(_) => {
+ self.build_primitive_array::<Date64Type>(rows, field.name())
+ }
+ DataType::Date32(_) => {
+ self.build_primitive_array::<Date32Type>(rows, field.name())
+ }
+ DataType::Time64(unit) => match unit {
+ TimeUnit::Microsecond => self
+ .build_primitive_array::<Time64MicrosecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Nanosecond => self
+ .build_primitive_array::<Time64NanosecondType>(
+ rows,
+ field.name(),
+ ),
+ _ => unimplemented!(),
+ },
+ DataType::Time32(unit) => match unit {
+ TimeUnit::Second => self
+ .build_primitive_array::<Time32SecondType>(
+ rows,
+ field.name(),
+ ),
+ TimeUnit::Millisecond => self
+ .build_primitive_array::<Time32MillisecondType>(
+ rows,
+ field.name(),
+ ),
+ _ => unimplemented!(),
+ },
+ DataType::Utf8 => {
+ let mut builder = StringBuilder::new(rows.len());
+ for row in rows {
+ if let Some(value) = row.get(field.name()) {
+ if let Some(str_v) = value.as_str() {
+ builder.append_value(str_v)?
+ } else {
+ builder.append(false)?
+ }
+ } else {
+ builder.append(false)?
+ }
+ }
+ Ok(Arc::new(builder.finish()) as ArrayRef)
+ }
+ DataType::List(ref t) => {
+ match t.data_type() {
+ DataType::Int8 => {
+ self.build_list_array::<Int8Type>(rows, field.name())
+ }
+ DataType::Int16 => {
+ self.build_list_array::<Int16Type>(rows, field.name())
+ }
+ DataType::Int32 => {
+ self.build_list_array::<Int32Type>(rows, field.name())
+ }
+ DataType::Int64 => {
+ self.build_list_array::<Int64Type>(rows, field.name())
+ }
+ DataType::UInt8 => {
+ self.build_list_array::<UInt8Type>(rows, field.name())
+ }
+ DataType::UInt16 => self
+ .build_list_array::<UInt16Type>(rows, field.name()),
+ DataType::UInt32 => self
+ .build_list_array::<UInt32Type>(rows, field.name()),
+ DataType::UInt64 => self
+ .build_list_array::<UInt64Type>(rows, field.name()),
+ DataType::Float32 => self
+ .build_list_array::<Float32Type>(rows, field.name()),
+ DataType::Float64 => self
+ .build_list_array::<Float64Type>(rows, field.name()),
+ DataType::Null => unimplemented!(),
+ DataType::Boolean => {
+ self.build_boolean_list_array(rows, field.name())
+ }
+ ref dtype @ DataType::Utf8 => {
+ // UInt64Type passed down below is a fake type for dictionary builder.
+ // It is there to make compiler happy.
+ self.list_array_string_array_builder::<UInt64Type>(
+ &dtype,
+ field.name(),
+ rows,
+ )
+ }
+ DataType::Dictionary(ref key_ty, _) => self
+ .build_wrapped_list_array(rows, field.name(), key_ty),
+ ref e => Err(ArrowError::JsonError(format!(
+ "Data type is currently not supported in a list : {:?}",
+ e
+ ))),
+ }
+ }
+ DataType::Dictionary(ref key_ty, ref val_ty) => self
+ .build_string_dictionary_array(
+ rows,
+ field.name(),
+ key_ty,
+ val_ty,
+ ),
+ DataType::Struct(fields) => {
+ // TODO: add a check limiting recursion
+ let len = rows.len();
+ let num_bytes = bit_util::ceil(len, 8);
+ let mut null_buffer = MutableBuffer::new(num_bytes)
+ .with_bitset(num_bytes, false);
+ let struct_rows = rows
+ .iter()
+ .enumerate()
+ .map(|(i, row)| {
+ (
+ i,
+ row.as_object()
+ .map(|v| v.get(field.name()))
+ .flatten(),
+ )
+ })
+ .map(|(i, v)| match v {
+ // we want the field as an object, if it's not, we treat as null
+ Some(Value::Object(value)) => {
+ bit_util::set_bit(null_buffer.data_mut(), i);
Review comment:
It should be more optimal to set all values as true, then only set the non-objects to false. I was struggling to get this right though, don't know why the test was failing.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org