You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/27 22:35:47 UTC

[GitHub] [arrow] nevi-me commented on a change in pull request #8788: ARROW-4544: [Rust] JSON nested struct reader

nevi-me commented on a change in pull request #8788:
URL: https://github.com/apache/arrow/pull/8788#discussion_r531799717



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -1022,6 +846,228 @@ impl<R: Read> Reader<R> {
         Ok(Arc::new(builder.finish()))
     }
 
+    fn build_struct_array(
+        &self,
+        rows: &[Value],
+        struct_fields: &[Field],
+        projection: &[String],
+    ) -> Result<Vec<ArrayRef>> {
+        let arrays: Result<Vec<ArrayRef>> =
+            struct_fields
+                .iter()
+                .filter(|field| {
+                    if projection.is_empty() {
+                        return true;
+                    }
+                    projection.contains(field.name())
+                })
+                .map(|field| {
+                    match field.data_type() {
+                        DataType::Null => unimplemented!(),
+                        DataType::Boolean => self.build_boolean_array(rows, field.name()),
+                        DataType::Float64 => {
+                            self.build_primitive_array::<Float64Type>(rows, field.name())
+                        }
+                        DataType::Float32 => {
+                            self.build_primitive_array::<Float32Type>(rows, field.name())
+                        }
+                        DataType::Int64 => {
+                            self.build_primitive_array::<Int64Type>(rows, field.name())
+                        }
+                        DataType::Int32 => {
+                            self.build_primitive_array::<Int32Type>(rows, field.name())
+                        }
+                        DataType::Int16 => {
+                            self.build_primitive_array::<Int16Type>(rows, field.name())
+                        }
+                        DataType::Int8 => {
+                            self.build_primitive_array::<Int8Type>(rows, field.name())
+                        }
+                        DataType::UInt64 => {
+                            self.build_primitive_array::<UInt64Type>(rows, field.name())
+                        }
+                        DataType::UInt32 => {
+                            self.build_primitive_array::<UInt32Type>(rows, field.name())
+                        }
+                        DataType::UInt16 => {
+                            self.build_primitive_array::<UInt16Type>(rows, field.name())
+                        }
+                        DataType::UInt8 => {
+                            self.build_primitive_array::<UInt8Type>(rows, field.name())
+                        }
+                        DataType::Timestamp(unit, _) => match unit {
+                            TimeUnit::Second => self
+                                .build_primitive_array::<TimestampSecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Microsecond => self
+                                .build_primitive_array::<TimestampMicrosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Millisecond => self
+                                .build_primitive_array::<TimestampMillisecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Nanosecond => self
+                                .build_primitive_array::<TimestampNanosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                        },
+                        DataType::Date64(_) => {
+                            self.build_primitive_array::<Date64Type>(rows, field.name())
+                        }
+                        DataType::Date32(_) => {
+                            self.build_primitive_array::<Date32Type>(rows, field.name())
+                        }
+                        DataType::Time64(unit) => match unit {
+                            TimeUnit::Microsecond => self
+                                .build_primitive_array::<Time64MicrosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Nanosecond => self
+                                .build_primitive_array::<Time64NanosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            _ => unimplemented!(),
+                        },
+                        DataType::Time32(unit) => match unit {
+                            TimeUnit::Second => self
+                                .build_primitive_array::<Time32SecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Millisecond => self
+                                .build_primitive_array::<Time32MillisecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            _ => unimplemented!(),
+                        },
+                        DataType::Utf8 => {
+                            let mut builder = StringBuilder::new(rows.len());
+                            for row in rows {
+                                if let Some(value) = row.get(field.name()) {
+                                    if let Some(str_v) = value.as_str() {
+                                        builder.append_value(str_v)?
+                                    } else {
+                                        builder.append(false)?
+                                    }
+                                } else {
+                                    builder.append(false)?
+                                }
+                            }
+                            Ok(Arc::new(builder.finish()) as ArrayRef)
+                        }
+                        DataType::List(ref t) => {
+                            match t.data_type() {
+                                DataType::Int8 => {
+                                    self.build_list_array::<Int8Type>(rows, field.name())
+                                }
+                                DataType::Int16 => {
+                                    self.build_list_array::<Int16Type>(rows, field.name())
+                                }
+                                DataType::Int32 => {
+                                    self.build_list_array::<Int32Type>(rows, field.name())
+                                }
+                                DataType::Int64 => {
+                                    self.build_list_array::<Int64Type>(rows, field.name())
+                                }
+                                DataType::UInt8 => {
+                                    self.build_list_array::<UInt8Type>(rows, field.name())
+                                }
+                                DataType::UInt16 => self
+                                    .build_list_array::<UInt16Type>(rows, field.name()),
+                                DataType::UInt32 => self
+                                    .build_list_array::<UInt32Type>(rows, field.name()),
+                                DataType::UInt64 => self
+                                    .build_list_array::<UInt64Type>(rows, field.name()),
+                                DataType::Float32 => self
+                                    .build_list_array::<Float32Type>(rows, field.name()),
+                                DataType::Float64 => self
+                                    .build_list_array::<Float64Type>(rows, field.name()),
+                                DataType::Null => unimplemented!(),
+                                DataType::Boolean => {
+                                    self.build_boolean_list_array(rows, field.name())
+                                }
+                                ref dtype @ DataType::Utf8 => {
+                                    // UInt64Type passed down below is a fake type for dictionary builder.
+                                    // It is there to make compiler happy.
+                                    self.list_array_string_array_builder::<UInt64Type>(
+                                        &dtype,
+                                        field.name(),
+                                        rows,
+                                    )
+                                }
+                                DataType::Dictionary(ref key_ty, _) => self
+                                    .build_wrapped_list_array(rows, field.name(), key_ty),
+                                ref e => Err(ArrowError::JsonError(format!(
+                            "Data type is currently not supported in a list : {:?}",
+                            e
+                        ))),
+                            }
+                        }
+                        DataType::Dictionary(ref key_ty, ref val_ty) => self
+                            .build_string_dictionary_array(
+                                rows,
+                                field.name(),
+                                key_ty,
+                                val_ty,
+                            ),
+                        DataType::Struct(fields) => {

Review comment:
       This is the main change, we recurse into the `build_struct_array` function

##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -1022,6 +846,228 @@ impl<R: Read> Reader<R> {
         Ok(Arc::new(builder.finish()))
     }
 
+    fn build_struct_array(
+        &self,
+        rows: &[Value],
+        struct_fields: &[Field],
+        projection: &[String],
+    ) -> Result<Vec<ArrayRef>> {
+        let arrays: Result<Vec<ArrayRef>> =
+            struct_fields
+                .iter()
+                .filter(|field| {
+                    if projection.is_empty() {
+                        return true;
+                    }
+                    projection.contains(field.name())
+                })
+                .map(|field| {
+                    match field.data_type() {
+                        DataType::Null => unimplemented!(),
+                        DataType::Boolean => self.build_boolean_array(rows, field.name()),
+                        DataType::Float64 => {
+                            self.build_primitive_array::<Float64Type>(rows, field.name())
+                        }
+                        DataType::Float32 => {
+                            self.build_primitive_array::<Float32Type>(rows, field.name())
+                        }
+                        DataType::Int64 => {
+                            self.build_primitive_array::<Int64Type>(rows, field.name())
+                        }
+                        DataType::Int32 => {
+                            self.build_primitive_array::<Int32Type>(rows, field.name())
+                        }
+                        DataType::Int16 => {
+                            self.build_primitive_array::<Int16Type>(rows, field.name())
+                        }
+                        DataType::Int8 => {
+                            self.build_primitive_array::<Int8Type>(rows, field.name())
+                        }
+                        DataType::UInt64 => {
+                            self.build_primitive_array::<UInt64Type>(rows, field.name())
+                        }
+                        DataType::UInt32 => {
+                            self.build_primitive_array::<UInt32Type>(rows, field.name())
+                        }
+                        DataType::UInt16 => {
+                            self.build_primitive_array::<UInt16Type>(rows, field.name())
+                        }
+                        DataType::UInt8 => {
+                            self.build_primitive_array::<UInt8Type>(rows, field.name())
+                        }
+                        DataType::Timestamp(unit, _) => match unit {
+                            TimeUnit::Second => self
+                                .build_primitive_array::<TimestampSecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Microsecond => self
+                                .build_primitive_array::<TimestampMicrosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Millisecond => self
+                                .build_primitive_array::<TimestampMillisecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Nanosecond => self
+                                .build_primitive_array::<TimestampNanosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                        },
+                        DataType::Date64(_) => {
+                            self.build_primitive_array::<Date64Type>(rows, field.name())
+                        }
+                        DataType::Date32(_) => {
+                            self.build_primitive_array::<Date32Type>(rows, field.name())
+                        }
+                        DataType::Time64(unit) => match unit {
+                            TimeUnit::Microsecond => self
+                                .build_primitive_array::<Time64MicrosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Nanosecond => self
+                                .build_primitive_array::<Time64NanosecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            _ => unimplemented!(),
+                        },
+                        DataType::Time32(unit) => match unit {
+                            TimeUnit::Second => self
+                                .build_primitive_array::<Time32SecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            TimeUnit::Millisecond => self
+                                .build_primitive_array::<Time32MillisecondType>(
+                                    rows,
+                                    field.name(),
+                                ),
+                            _ => unimplemented!(),
+                        },
+                        DataType::Utf8 => {
+                            let mut builder = StringBuilder::new(rows.len());
+                            for row in rows {
+                                if let Some(value) = row.get(field.name()) {
+                                    if let Some(str_v) = value.as_str() {
+                                        builder.append_value(str_v)?
+                                    } else {
+                                        builder.append(false)?
+                                    }
+                                } else {
+                                    builder.append(false)?
+                                }
+                            }
+                            Ok(Arc::new(builder.finish()) as ArrayRef)
+                        }
+                        DataType::List(ref t) => {
+                            match t.data_type() {
+                                DataType::Int8 => {
+                                    self.build_list_array::<Int8Type>(rows, field.name())
+                                }
+                                DataType::Int16 => {
+                                    self.build_list_array::<Int16Type>(rows, field.name())
+                                }
+                                DataType::Int32 => {
+                                    self.build_list_array::<Int32Type>(rows, field.name())
+                                }
+                                DataType::Int64 => {
+                                    self.build_list_array::<Int64Type>(rows, field.name())
+                                }
+                                DataType::UInt8 => {
+                                    self.build_list_array::<UInt8Type>(rows, field.name())
+                                }
+                                DataType::UInt16 => self
+                                    .build_list_array::<UInt16Type>(rows, field.name()),
+                                DataType::UInt32 => self
+                                    .build_list_array::<UInt32Type>(rows, field.name()),
+                                DataType::UInt64 => self
+                                    .build_list_array::<UInt64Type>(rows, field.name()),
+                                DataType::Float32 => self
+                                    .build_list_array::<Float32Type>(rows, field.name()),
+                                DataType::Float64 => self
+                                    .build_list_array::<Float64Type>(rows, field.name()),
+                                DataType::Null => unimplemented!(),
+                                DataType::Boolean => {
+                                    self.build_boolean_list_array(rows, field.name())
+                                }
+                                ref dtype @ DataType::Utf8 => {
+                                    // UInt64Type passed down below is a fake type for dictionary builder.
+                                    // It is there to make compiler happy.
+                                    self.list_array_string_array_builder::<UInt64Type>(
+                                        &dtype,
+                                        field.name(),
+                                        rows,
+                                    )
+                                }
+                                DataType::Dictionary(ref key_ty, _) => self
+                                    .build_wrapped_list_array(rows, field.name(), key_ty),
+                                ref e => Err(ArrowError::JsonError(format!(
+                            "Data type is currently not supported in a list : {:?}",
+                            e
+                        ))),
+                            }
+                        }
+                        DataType::Dictionary(ref key_ty, ref val_ty) => self
+                            .build_string_dictionary_array(
+                                rows,
+                                field.name(),
+                                key_ty,
+                                val_ty,
+                            ),
+                        DataType::Struct(fields) => {
+                            // TODO: add a check limiting recursion
+                            let len = rows.len();
+                            let num_bytes = bit_util::ceil(len, 8);
+                            let mut null_buffer = MutableBuffer::new(num_bytes)
+                                .with_bitset(num_bytes, false);
+                            let struct_rows = rows
+                                .iter()
+                                .enumerate()
+                                .map(|(i, row)| {
+                                    (
+                                        i,
+                                        row.as_object()
+                                            .map(|v| v.get(field.name()))
+                                            .flatten(),
+                                    )
+                                })
+                                .map(|(i, v)| match v {
+                                    // we want the field as an object, if it's not, we treat as null
+                                    Some(Value::Object(value)) => {
+                                        bit_util::set_bit(null_buffer.data_mut(), i);

Review comment:
       It should be more optimal to set all values as true, then only set the non-objects to false. I was struggling to get this right though, don't know why the test was failing.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org