You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/10/10 20:31:48 UTC

[GitHub] [arrow] vertexclique opened a new pull request #8430: ARROW-10249 - Support nested dictionaries inside list arrays

vertexclique opened a new pull request #8430:
URL: https://github.com/apache/arrow/pull/8430


   Support nested dictionaries inside list arrays for arrow JSON reader.
   This pr makes reading nested JSON fields a little bit easier by making a single reader method for the list arrays.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r503520332



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<StringBuilder>)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                builder.values().append_value(&v)?
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Amend to the list
+                        builder.append(true)?;

Review comment:
       append it is.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jorgecarleitao closed pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
jorgecarleitao closed pull request #8430:
URL: https://github.com/apache/arrow/pull/8430


   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-714323287


   > (this part is regardless, and could fit in a future PR) You mentioned in the commit message that this drastically reduces performance. Did you quantified how much was that, or is there any benchmark that you have in mind that I can run on my own? I am asking that because that is one of the situations where unsafe can be considered based on the risks and maintenance vs performance benefits.
   
   Based on my estimations of how I wrote it, it will increase the parsing time for long record windows with large nested element size. I don't estimate how large will cause it but the main concern here is that refcell's stacked borrow and finalizer's dynamic dispatch. This can be decreased by pruning refcell by visiting the ticket that I mentioned in there to unify the interfaces. Then we will use dynamic dispatch in everywhere. Thou, dynamic dispatch can be slightly slower compared to pointer casts(or in-place transmutes) it can solve the overhead of refcell. This piece of code is open to revisiting after [ARROW-10335](https://issues.apache.org/jira/browse/ARROW-10335).


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-712135918


   @jorgecarleitao @nevi-me  Can I push this out of the door peeps? I am kind of blocked by this atm.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r503529865



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(

Review comment:
       Nope, it is inside the loop where the `builder` is carried over from the previous loop. If we apply recursion for nested structures sometime later, I want to keep the pointer juggling inside a single block.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-707341797


   @nevi-me 
   
   > I didn't do a detailed review, but I'm happy with the changes. It's been a while since I looked at the JSON reader, how hard/easy do you think it would be for us to support the outstanding work on https://issues.apache.org/jira/browse/ARROW-4534?
   
   Thanks! Especially on the nested reading part (https://issues.apache.org/jira/browse/ARROW-4544), it would be nice to reuse builders at entry. Having a recursive reader with a `recursion_limit` set would be good to go. If we go down into the iterative approach, we will explicitly generate a macro to expand on the compile-time with a depth embedded in. That might slow down to compile times and create larger binaries.
   
   The good part of the recursive approach is that it will be limited by the stack size (but there might be growing stack implementation), where the user can increase this by hand. The bad part is that the recursion limit we have defined shouldn't hit to default stack size, and we should have a sweet spot for it.
   
   About the other ticket that is still open (https://issues.apache.org/jira/browse/ARROW-4803). Type inference for schema might be hard at first. Although it is hard, we can do assumption based parsing by parsing that first(or +2) record's data to infer the type. But when the type is given, we can try to parse all down in iso format.
   
   > I also have the feeling that the reader might be slower than other readers. What has been your experience @vertexclique?
   
   I didn't test the performance, since I was using this in tests, I needed it. We can create a benchmark for r/w, maybe? wdyt?


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-711318388


   @jorgecarleitao written docs.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r503288791



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(

Review comment:
       > I think splitting this method into two, one for plain Utf8 and one for dictionary encoded, would simplify the code and should allow it to work without dynamic dispatch.
   
   There is no dynamic dispatch going on here. dyn ArrayBuilder is converted down to the concrete type. Dynamic dispatch is `<T as ArrayBuilder>::finish()`. That doesn't exist here.
   
   > There is only a bit of common code converting the json into a vec (line 749) which could be extracted and reused.
   
   If extracted, loop contains different builders with different build methods as mentioned in:
   https://github.com/apache/arrow/pull/8430/files#diff-3fa6fc3c0ee201f5a3a1a5d25d0062ffR775
   So that wouldn't work, because you need code duplication with the same parameters of this method, which doesn't bring any value again.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jhorstmann commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
jhorstmann commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r503270914



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(

Review comment:
       I think splitting this method into two, one for plain Utf8 and one for dictionary encoded, would simplify the code and should allow it to work without dynamic dispatch. There is only a bit of common code converting the json into a vec (line 749) which could be extracted and reused.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-707074594


   @andygrove @paddyhoran @nevi-me Can I get a pair of eyes here?


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jhorstmann commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
jhorstmann commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r503372369



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(

Review comment:
       Ok, but splitting the function would allow to remove the Box::leak and unsafe pointer casting, no?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-712937618


   @andygrove 
   
   > @vertexclique When I started contributing here I had similar feelings but there is no reason you should be blocked. You can maintain a branch in your fork with all of your contributions that you need and have a git dependency on that. You could even vendor the code in your project if you need to release to crates.io
   
   > Many/most of us here are volunteers and reviewing PRs in our free time so we can't always be responsive when we have work commitments.
   
   Sorry if I caused a disturbance. I mean no disturbance here. We are in the same shoes altogether, since most of us are having work commitments. If I felt anyone uneasy, I don't mean that. Thanks for the friendly reminder.


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] andygrove commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
andygrove commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-712875985


   > @jorgecarleitao @nevi-me Can I push this out of the door peeps? I am kind of blocked by this atm.
   
   @vertexclique  When I started contributing here I had similar feelings but there is no reason you should be blocked. You can maintain a branch in your fork with all of your contributions that you need and have a git dependency on that. You could even vendor the code in your project if you need to release to crates.io
   
   Many/most of us here are volunteers and reviewing PRs in our free time so we can't always be responsive when we have work commitments.
   
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jorgecarleitao commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
jorgecarleitao commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-712369692


   @vertexclique , I think that the feature is appreciated, but the implementation shows some underlying issue with the API to build arrays.
   
   IMO, In light of this PR we should revisit the API before merging this to ensure that we can create nested structures without `unsafe`, and, if we need an `unsafe`, that we use it in a well-defined, limited scope.
   
   If we accept this, IMO we are opening the door to all kind of usage of `unsafe` in high-level APIs, which is not only an anti-pattern in Rust, but also discouraged except in well defined use-cases, as it risks UB. Note that I am not criticizing this implementation in particular, but that if anyone changes this, it requires a significant care by everyone involved to ensure no UB.
   
   In other words, this implementation adds technical debt that I (and I alone) will be unable to pay back, which means that I cannot commit at maintaining this change. I would obviously not block others from approving it and merging if they feel that they can maintain this change.
   
   cc @paddyhoran
   


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jorgecarleitao commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
jorgecarleitao commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r504385888



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<StringBuilder>)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                builder.values().append_value(&v)?
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Append to the list
+                        builder.append(true)?;
+                    }
+                    DataType::Dictionary(_, _) => {
+                        let builder: &mut &mut ListBuilder<
+                            StringDictionaryBuilder<DICT_TY>,
+                        > = unsafe {

Review comment:
       Same here.

##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<StringBuilder>)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                builder.values().append_value(&v)?
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Append to the list
+                        builder.append(true)?;
+                    }
+                    DataType::Dictionary(_, _) => {
+                        let builder: &mut &mut ListBuilder<
+                            StringDictionaryBuilder<DICT_TY>,
+                        > = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<
+                                    StringDictionaryBuilder<DICT_TY>,
+                                >)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                let _ = builder.values().append(&v)?;
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Append to the list
+                        builder.append(true)?;
+                    }
+                    e => {
+                        return Err(ArrowError::JsonError(format!(
+                            "Nested list data builder type is not supported: {:?}",
+                            e
+                        )))
+                    }
+                }
+            }
+        }
+        unsafe { Ok((*Box::from_raw(builder)).finish() as ArrayRef) }

Review comment:
       why do we need to break lifetime rules here with `Box::leak`? Isn't there a safe approach to this? If this is required, can we comment why this is required and why other approaches do not?
   

##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);

Review comment:
       IMO this needs documentation. As it stands, it is hard to understand why this is being done.

##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.

Review comment:
       I think we could rely on ARROW- issues to document `todo`s instead of user handles.

##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {

Review comment:
       I am also very happy with this: there is likely a more idiomatic way of performing this.
   
   Could you document / add comments about what is happening here?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] github-actions[bot] commented on pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
github-actions[bot] commented on pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#issuecomment-706607598


   https://issues.apache.org/jira/browse/ARROW-10249


----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] vertexclique commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
vertexclique commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r503288791



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(

Review comment:
       > I think splitting this method into two, one for plain Utf8 and one for dictionary encoded, would simplify the code and should allow it to work without dynamic dispatch.
   
   There is no dynamic dispatch going on here. dyn ArrayBuilder is converted down to the concrete type. So no dynamic dispatch is going on.
   
   > There is only a bit of common code converting the json into a vec (line 749) which could be extracted and reused.
   
   If extracted, loop contains different builders with different build methods as mentioned in:
   https://github.com/apache/arrow/pull/8430/files#diff-3fa6fc3c0ee201f5a3a1a5d25d0062ffR775
   So that wouldn't work, because you need code duplication with the same parameters of this method, which doesn't bring any value again.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] jorgecarleitao commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
jorgecarleitao commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r504385827



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {

Review comment:
       Could you document / add comments about what is happening here?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] andygrove commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
andygrove commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r508527928



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<StringBuilder>)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                builder.values().append_value(&v)?
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Append to the list
+                        builder.append(true)?;
+                    }
+                    DataType::Dictionary(_, _) => {
+                        let builder: &mut &mut ListBuilder<
+                            StringDictionaryBuilder<DICT_TY>,
+                        > = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<
+                                    StringDictionaryBuilder<DICT_TY>,
+                                >)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                let _ = builder.values().append(&v)?;
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Append to the list
+                        builder.append(true)?;
+                    }
+                    e => {
+                        return Err(ArrowError::JsonError(format!(
+                            "Nested list data builder type is not supported: {:?}",
+                            e
+                        )))
+                    }
+                }
+            }
+        }
+        unsafe { Ok((*Box::from_raw(builder)).finish() as ArrayRef) }

Review comment:
       I share these concerns and agree that if there is a compelling argument for this we need to have it documented.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org



[GitHub] [arrow] nevi-me commented on a change in pull request #8430: ARROW-10249: [Rust] Support nested dictionaries inside list arrays

Posted by GitBox <gi...@apache.org>.
nevi-me commented on a change in pull request #8430:
URL: https://github.com/apache/arrow/pull/8430#discussion_r503472334



##########
File path: rust/arrow/src/json/reader.rs
##########
@@ -612,6 +644,240 @@ impl<R: Read> Reader<R> {
         arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr).map(Some))
     }
 
+    fn build_wrapped_list_array(
+        &self,
+        rows: &[Value],
+        col_name: &str,
+        key_type: &DataType,
+    ) -> Result<ArrayRef> {
+        match *key_type {
+            DataType::Int8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int8Type>(&dtype, col_name, rows)
+            }
+            DataType::Int16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int16Type>(&dtype, col_name, rows)
+            }
+            DataType::Int32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int32Type>(&dtype, col_name, rows)
+            }
+            DataType::Int64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::Int64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<Int64Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt8 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt8),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt8Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt16 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt16),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt16Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt32 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt32),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt32Type>(&dtype, col_name, rows)
+            }
+            DataType::UInt64 => {
+                let dtype = DataType::Dictionary(
+                    Box::new(DataType::UInt64),
+                    Box::new(DataType::Utf8),
+                );
+                self.list_array_string_array_builder::<UInt64Type>(&dtype, col_name, rows)
+            }
+            ref e => Err(ArrowError::JsonError(format!(
+                "Data type is currently not supported for dictionaries in list : {:?}",
+                e
+            ))),
+        }
+    }
+
+    #[inline(always)]
+    fn list_array_string_array_builder<DICT_TY>(
+        &self,
+        data_type: &DataType,
+        col_name: &str,
+        rows: &[Value],
+    ) -> Result<ArrayRef>
+    where
+        DICT_TY: ArrowPrimitiveType + ArrowDictionaryKeyType,
+    {
+        let builder: Box<dyn ArrayBuilder> = match data_type {
+            DataType::Utf8 => {
+                let values_builder = StringBuilder::new(rows.len() * 5);
+                Box::new(ListBuilder::new(values_builder))
+            }
+            DataType::Dictionary(_, _) => {
+                let values_builder =
+                    self.build_string_dictionary_builder::<DICT_TY>(rows.len() * 5)?;
+                Box::new(ListBuilder::new(values_builder))
+            }
+            e => {
+                return Err(ArrowError::JsonError(format!(
+                    "Nested list data builder type is not supported: {:?}",
+                    e
+                )))
+            }
+        };
+        let mut builder = Box::leak(builder);
+
+        for row in rows {
+            if let Some(value) = row.get(col_name) {
+                // value can be an array or a scalar
+                let vals: Vec<Option<String>> = if let Value::String(v) = value {
+                    vec![Some(v.to_string())]
+                } else if let Value::Array(n) = value {
+                    n.iter()
+                        .map(|v: &Value| {
+                            if v.is_string() {
+                                Some(v.as_str().unwrap().to_string())
+                            } else if v.is_array() || v.is_object() {
+                                // implicitly drop nested values
+                                // TODO support deep-nesting
+                                None
+                            } else {
+                                Some(v.to_string())
+                            }
+                        })
+                        .collect()
+                } else if let Value::Null = value {
+                    vec![None]
+                } else if !value.is_object() {
+                    vec![Some(value.to_string())]
+                } else {
+                    return Err(ArrowError::JsonError(
+                        "Only scalars are currently supported in JSON arrays".to_string(),
+                    ));
+                };
+
+                // TODO: (vertexclique): APIs of dictionary arrays and others are different. Unify them.
+                match data_type {
+                    DataType::Utf8 => {
+                        let builder: &mut &mut ListBuilder<StringBuilder> = unsafe {
+                            &mut *(&mut builder as *mut &mut dyn ArrayBuilder
+                                as *mut &mut ListBuilder<StringBuilder>)
+                        };
+                        for val in vals {
+                            if let Some(v) = val {
+                                builder.values().append_value(&v)?
+                            } else {
+                                builder.values().append_null()?
+                            };
+                        }
+
+                        // Amend to the list
+                        builder.append(true)?;

Review comment:
       amend or append?




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org