You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/20 13:57:10 UTC

[GitHub] [arrow] andygrove commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

andygrove commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r508527928



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<StringBuilder>)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                builder.values().append_value(&v)?
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Append to the list
+                        builder.append(true)?;
+                    }
+                    DataType::Dictionary(_, _) => {
+                        let builder: &mut &mut ListBuilder<
+                            StringDictionaryBuilder<DICT_TY>,
+                        > = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<
+                                    StringDictionaryBuilder<DICT_TY>,
+                                >)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                let _ = builder.values().append(&v)?;
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Append to the list
+                        builder.append(true)?;
+                    }
+                    e => {
+                        return Err(ArrowError::JsonError(format!(
+                            "Nested list data builder type is not supported: {:?}",
+                            e
+                        )))
+                    }
+                }
+            }
+        }
+        unsafe { Ok((*Box::from_raw(builder)).finish() as ArrayRef) }

Review comment:
       I share these concerns and agree that if there is a compelling argument for this we need to have it documented.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org