You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/03 14:15:33 UTC

[GitHub] [arrow] vertexclique commented on a change in pull request #8561: ARROW-10449 [Rust] Make Dictionary::keys be an array

vertexclique commented on a change in pull request #8561:
URL: https://github.com/apache/arrow/pull/8561#discussion_r516550528



##########
File path: rust/arrow/src/array/array.rs
##########
@@ -2319,13 +2221,24 @@ impl<T: ArrowPrimitiveType> From<ArrayDataRef> for DictionaryArray<T> {
             "DictionaryArray should contain a single child array (values)."
         );
 
-        let raw_values = data.buffers()[0].raw_data();
-        let dtype: &DataType = data.data_type();
-        let values = make_array(data.child_data()[0].clone());
-        if let DataType::Dictionary(_, _) = dtype {
+        if let DataType::Dictionary(key_data_type, _) = data.data_type() {
+            if key_data_type.as_ref() != &T::DATA_TYPE {
+                panic!("DictionaryArray's data type must match.")

Review comment:
       ```suggestion
                   unreachable!("DictionaryArray's data type must match.")
   ```
   Since the former is good for defensive programming but doesn't convey the idea.

##########
File path: rust/arrow/src/array/array.rs
##########
@@ -2115,28 +2115,32 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer, usize)> for StructArray {
 /// Example **with nullable** data:
 ///
 /// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
 /// use arrow::datatypes::Int8Type;
 /// let test = vec!["a", "a", "b", "c"];
 /// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), None, Some(1)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
 /// ```
 ///
 /// Example **without nullable** data:
 ///
 /// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
 /// use arrow::datatypes::Int8Type;
 /// let test = vec!["a", "a", "b", "c"];
 /// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), Some(1), Some(2)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
 /// ```
 pub struct DictionaryArray<K: ArrowPrimitiveType> {
-    /// Array of keys, stored as a PrimitiveArray<K>.
+    /// data of this dictionary. Note that this is _not_ compatible with the C Data interface,
+    /// as, in the current implementation, `values` below are the first child of this struct.
     data: ArrayDataRef,
 
-    /// Pointer to the key values.
-    raw_values: RawPtrBox<K::Native>,
+    /// data of the keys of this dictionary. These are constructed from the buffer and null bitmap

Review comment:
       ```suggestion
       /// Data of the keys of this dictionary. These are constructed from the buffer and null bitmap
   ```

##########
File path: rust/arrow/src/array/array.rs
##########
@@ -2319,13 +2221,24 @@ impl<T: ArrowPrimitiveType> From<ArrayDataRef> for DictionaryArray<T> {
             "DictionaryArray should contain a single child array (values)."
         );
 
-        let raw_values = data.buffers()[0].raw_data();
-        let dtype: &DataType = data.data_type();
-        let values = make_array(data.child_data()[0].clone());
-        if let DataType::Dictionary(_, _) = dtype {
+        if let DataType::Dictionary(key_data_type, _) = data.data_type() {
+            if key_data_type.as_ref() != &T::DATA_TYPE {
+                panic!("DictionaryArray's data type must match.")
+            };
+            // create a zero-copy of the keys' data
+            let keys = PrimitiveArray::<T>::from(Arc::new(ArrayData::new(
+                T::DATA_TYPE,
+                data.len(),
+                Some(data.null_count()),
+                data.null_buffer().map(|b| b.clone()),

Review comment:
       ```suggestion
                   data.null_buffer().clone(),
   ```
   .clone should also work over the option of clonables.

##########
File path: rust/parquet/src/arrow/arrow_writer.rs
##########
@@ -283,48 +280,40 @@ trait Materialize<K, V> {
     fn materialize(&self) -> Vec<Self::Output>;
 }
 
-macro_rules! materialize_string {
-    ($($k:ty,)*) => {
-        $(impl Materialize<$k, arrow_array::StringArray> for dyn Array {
-            type Output = ByteArray;
-
-            fn materialize(&self) -> Vec<Self::Output> {
-                use std::convert::TryFrom;
-
-                let typed_array = self.as_any()
-                    .downcast_ref::<$k>()
-                    .expect("Unable to get dictionary array");
-
-                let keys = typed_array.keys();
-
-                let value_buffer = typed_array.values();
-                let values = value_buffer
-                    .as_any()
-                    .downcast_ref::<arrow_array::StringArray>()
-                    .unwrap();
-
-                // This removes NULL values from the NullableIter, but
-                // they're encoded by the levels, so that's fine.
-                keys
-                    .flatten()
-                    .map(|key| usize::try_from(key).unwrap_or_else(|k| panic!("key {} does not fit in usize", k)))
-                    .map(|key| values.value(key))
-                    .map(ByteArray::from)
-                    .collect()
-            }
-        })*
-    };
-}
+impl<K> Materialize<K, arrow_array::StringArray> for dyn Array
+where
+    K: arrow::datatypes::ArrowDictionaryKeyType,
+{
+    type Output = ByteArray;
+
+    fn materialize(&self) -> Vec<Self::Output> {
+        use arrow::datatypes::ArrowNativeType;
 
-materialize_string! {
-    arrow_array::Int8DictionaryArray,
-    arrow_array::Int16DictionaryArray,
-    arrow_array::Int32DictionaryArray,
-    arrow_array::Int64DictionaryArray,
-    arrow_array::UInt8DictionaryArray,
-    arrow_array::UInt16DictionaryArray,
-    arrow_array::UInt32DictionaryArray,
-    arrow_array::UInt64DictionaryArray,
+        let typed_array = self
+            .as_any()
+            .downcast_ref::<arrow_array::DictionaryArray<K>>()
+            .expect("Unable to get dictionary array");
+
+        let keys = typed_array.keys();
+
+        let value_buffer = typed_array.values();
+        let values = value_buffer
+            .as_any()
+            .downcast_ref::<arrow_array::StringArray>()
+            .unwrap();
+
+        // This removes NULL values from the keys, but
+        // they're encoded by the levels, so that's fine.
+        keys.into_iter()
+            .flatten()
+            .map(|key| {
+                key.to_usize()
+                    .unwrap_or_else(|| panic!("key {:?} does not fit in usize", key))
+            })
+            .map(|key| values.value(key))
+            .map(ByteArray::from)
+            .collect()

Review comment:
       This is coming from parquet code @carols10cents wrote. Any particular reason this is landing here with Materialize trait?

##########
File path: rust/arrow/src/array/array.rs
##########
@@ -2115,28 +2115,32 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer, usize)> for StructArray {
 /// Example **with nullable** data:
 ///
 /// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
 /// use arrow::datatypes::Int8Type;
 /// let test = vec!["a", "a", "b", "c"];
 /// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), None, Some(1)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
 /// ```
 ///
 /// Example **without nullable** data:
 ///
 /// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
 /// use arrow::datatypes::Int8Type;
 /// let test = vec!["a", "a", "b", "c"];
 /// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), Some(1), Some(2)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
 /// ```
 pub struct DictionaryArray<K: ArrowPrimitiveType> {
-    /// Array of keys, stored as a PrimitiveArray<K>.
+    /// data of this dictionary. Note that this is _not_ compatible with the C Data interface,

Review comment:
       ```suggestion
       /// Data of this dictionary. Note that this is _not_ compatible with the C Data interface,
   ```

##########
File path: rust/arrow/src/array/array.rs
##########
@@ -2398,30 +2311,23 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
 
     /// Returns the total number of bytes of memory occupied by the buffers owned by this [DictionaryArray].
     fn get_buffer_memory_size(&self) -> usize {
-        self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
+        // Since both `keys` and `values` derive (are references from) `data`, we only account for `data`.
+        self.data.get_array_memory_size()

Review comment:
       This part doesn't look true. Am I missing something? Since keys array is also a primitive array, and values are string array both of the arrays' methods for buffer_memory_size should be called and summed up respectively.
   
   Same applies for the array memory size too. Since `zerocopy` is not that much of a zero-copy in the physical memory level it would be nice to explicitly add them in array size calculation too.




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org