You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by vi...@apache.org on 2022/05/09 19:31:42 UTC
[arrow-rs] branch master updated: Support dictionary arrays in length and bit_length (#1674)
This is an automated email from the ASF dual-hosted git repository.
viirya pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 42c9e025c Support dictionary arrays in length and bit_length (#1674)
42c9e025c is described below
commit 42c9e025c4af3958a4d45ae54d2d9d267cce2fa6
Author: Liang-Chi Hsieh <vi...@gmail.com>
AuthorDate: Mon May 9 12:31:37 2022 -0700
Support dictionary arrays in length and bit_length (#1674)
* Support dictionary arrays in length and bit_length
* Fix typo
---
arrow/src/compute/kernels/length.rs | 165 ++++++++++++++++++++++++++++++++++--
1 file changed, 160 insertions(+), 5 deletions(-)
diff --git a/arrow/src/compute/kernels/length.rs b/arrow/src/compute/kernels/length.rs
index e52035970..a68aa2bde 100644
--- a/arrow/src/compute/kernels/length.rs
+++ b/arrow/src/compute/kernels/length.rs
@@ -19,10 +19,12 @@
use crate::{array::*, buffer::Buffer, datatypes::ArrowPrimitiveType};
use crate::{
- datatypes::{DataType, Int32Type, Int64Type},
+ datatypes::*,
error::{ArrowError, Result},
};
+use std::sync::Arc;
+
macro_rules! unary_offsets {
($array: expr, $data_type: expr, $op: expr) => {{
let slice = $array.value_offsets();
@@ -56,6 +58,27 @@ macro_rules! unary_offsets {
}};
}
+/// Applies `$kernel` to the values of a dictionary array and re-wraps the
+/// result with the original keys.
+///
+/// * `$array`: identifier of the array; it is downcast to `DictionaryArray<$gt>`
+///   in the arm whose `DataType::$t` matches the key type.
+/// * `$kernel`: expression called with `dict.values()`; its result is unwrapped
+///   with `?`, so the macro must be expanded in a function returning `Result`.
+/// * `$kt`: identifier of the key type, matched through `as_ref()`.
+/// * `$t: $gt`: comma-separated `DataType` variant / key type pairs.
+///
+/// Evaluates to `Ok(Arc<DictionaryArray<$gt>>)` on success. A key type that is
+/// not listed produces `ArrowError::ComputeError` rather than a panic, because
+/// the expansion site already reports failures through `Result`.
+///
+/// # Panics
+///
+/// Panics if `$array` cannot be downcast to the dictionary type selected by
+/// `$kt`.
+macro_rules! kernel_dict {
+    ($array: ident, $kernel: expr, $kt: ident, $($t: ident: $gt: ident), *) => {
+        match $kt.as_ref() {
+            $(&DataType::$t => {
+                let dict = $array
+                    .as_any()
+                    .downcast_ref::<DictionaryArray<$gt>>()
+                    .unwrap_or_else(|| {
+                        panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
+                               stringify!($gt), $array.data_type())
+                    });
+                // Run the kernel once over the dictionary values only; the keys
+                // are reused unchanged for the output array.
+                let values = $kernel(dict.values())?;
+                let result = DictionaryArray::try_new(dict.keys(), &values)?;
+                Ok(Arc::new(result))
+            },
+            )*
+            t => Err(ArrowError::ComputeError(format!(
+                "Unsupported dictionary key type: {}",
+                t
+            ))),
+        }
+    }
+}
+
fn length_list<O, T>(array: &dyn Array) -> ArrayRef
where
O: OffsetSizeTrait,
@@ -127,10 +150,26 @@ where
/// For list array, length is the number of elements in each list.
/// For string array and binary array, length is the number of bytes of each value.
///
-/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray
+/// * this only accepts ListArray/LargeListArray, StringArray/LargeStringArray and BinaryArray/LargeBinaryArray,
+///   or a DictionaryArray whose values are one of the above array types
/// * length of null is null.
pub fn length(array: &dyn Array) -> Result<ArrayRef> {
match array.data_type() {
+ DataType::Dictionary(kt, _) => {
+ kernel_dict!(
+ array,
+ |a| { length(a) },
+ kt,
+ Int8: Int8Type,
+ Int16: Int16Type,
+ Int32: Int32Type,
+ Int64: Int64Type,
+ UInt8: UInt8Type,
+ UInt16: UInt16Type,
+ UInt32: UInt32Type,
+ UInt64: UInt64Type
+ )
+ }
DataType::List(_) => Ok(length_list::<i32, Int32Type>(array)),
DataType::LargeList(_) => Ok(length_list::<i64, Int64Type>(array)),
DataType::Utf8 => Ok(length_string::<i32, Int32Type>(array)),
@@ -146,11 +185,27 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef> {
/// Returns an array of Int32/Int64 denoting the number of bits in each value in the array.
///
-/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray
+/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, BinaryArray and LargeBinaryArray,
+///   or a DictionaryArray whose values are one of the above array types
/// * bit_length of null is null.
/// * bit_length is in number of bits
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
match array.data_type() {
+ DataType::Dictionary(kt, _) => {
+ kernel_dict!(
+ array,
+ |a| { bit_length(a) },
+ kt,
+ Int8: Int8Type,
+ Int16: Int16Type,
+ Int32: Int32Type,
+ Int64: Int64Type,
+ UInt8: UInt8Type,
+ UInt16: UInt16Type,
+ UInt32: UInt32Type,
+ UInt64: UInt64Type
+ )
+ }
DataType::Utf8 => Ok(bit_length_string::<i32, Int32Type>(array)),
DataType::LargeUtf8 => Ok(bit_length_string::<i64, Int64Type>(array)),
DataType::Binary => Ok(bit_length_binary::<i32, Int32Type>(array)),
@@ -164,8 +219,6 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef> {
#[cfg(test)]
mod tests {
- use crate::datatypes::{Float32Type, Int8Type};
-
use super::*;
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
@@ -570,4 +623,106 @@ mod tests {
Ok(())
}
+
+    /// Runs the dictionary `length` check once for each of the eight integer
+    /// key types.
+    #[test]
+    fn length_dictionary() -> Result<()> {
+        let checks: [fn() -> Result<()>; 8] = [
+            _length_dictionary::<Int8Type>,
+            _length_dictionary::<Int16Type>,
+            _length_dictionary::<Int32Type>,
+            _length_dictionary::<Int64Type>,
+            _length_dictionary::<UInt8Type>,
+            _length_dictionary::<UInt16Type>,
+            _length_dictionary::<UInt32Type>,
+            _length_dictionary::<UInt64Type>,
+        ];
+        // Run in order and return the first error, exactly like a chain of `?`.
+        checks.iter().try_for_each(|check| check())
+    }
+
+    /// Checks `length` on a `DictionaryArray<K>` of strings that contains nulls.
+    ///
+    /// Builds 100 entries cycling through five strings, with every entry at
+    /// cycle position 3 replaced by null, and expects each non-null entry to
+    /// map to its byte length and each null to stay null.
+    fn _length_dictionary<K: ArrowDictionaryKeyType>() -> Result<()> {
+        const TOTAL: i32 = 100;
+
+        let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"];
+        let data: Vec<Option<&str>> = (0..TOTAL)
+            .map(|n| {
+                let i = n % 5;
+                if i == 3 {
+                    None
+                } else {
+                    Some(v[i as usize])
+                }
+            })
+            .collect();
+
+        let dict_array: DictionaryArray<K> = data.iter().copied().collect();
+
+        let expected: Vec<Option<i32>> =
+            data.iter().map(|opt| opt.map(|s| s.len() as i32)).collect();
+
+        let res = length(&dict_array)?;
+        let res_dict = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
+        // Decode through the keys of the *result* (not the input) so the test
+        // also verifies that the kernel carried the keys over correctly.
+        let actual: Vec<Option<i32>> = res_dict
+            .values()
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap()
+            .take_iter(res_dict.keys_iter())
+            .collect();
+
+        // Whole-vector comparison also catches a length mismatch.
+        assert_eq!(expected, actual);
+
+        Ok(())
+    }
+
+    /// Runs the dictionary `bit_length` check once for each of the eight
+    /// integer key types.
+    #[test]
+    fn bit_length_dictionary() -> Result<()> {
+        let checks: [fn() -> Result<()>; 8] = [
+            _bit_length_dictionary::<Int8Type>,
+            _bit_length_dictionary::<Int16Type>,
+            _bit_length_dictionary::<Int32Type>,
+            _bit_length_dictionary::<Int64Type>,
+            _bit_length_dictionary::<UInt8Type>,
+            _bit_length_dictionary::<UInt16Type>,
+            _bit_length_dictionary::<UInt32Type>,
+            _bit_length_dictionary::<UInt64Type>,
+        ];
+        // Run in order and return the first error, exactly like a chain of `?`.
+        checks.iter().try_for_each(|check| check())
+    }
+
+    /// Checks `bit_length` on a `DictionaryArray<K>` of strings that contains
+    /// nulls.
+    ///
+    /// Builds 100 entries cycling through five strings, with every entry at
+    /// cycle position 3 replaced by null, and expects each non-null entry to
+    /// map to eight times its byte length and each null to stay null.
+    fn _bit_length_dictionary<K: ArrowDictionaryKeyType>() -> Result<()> {
+        const TOTAL: i32 = 100;
+
+        let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"];
+        let data: Vec<Option<&str>> = (0..TOTAL)
+            .map(|n| {
+                let i = n % 5;
+                if i == 3 {
+                    None
+                } else {
+                    Some(v[i as usize])
+                }
+            })
+            .collect();
+
+        let dict_array: DictionaryArray<K> = data.iter().copied().collect();
+
+        // Bit length is defined on bytes, so use the byte length rather than
+        // the character count; the two only coincide for ASCII input.
+        let expected: Vec<Option<i32>> = data
+            .iter()
+            .map(|opt| opt.map(|s| (s.len() * 8) as i32))
+            .collect();
+
+        let res = bit_length(&dict_array)?;
+        let res_dict = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
+        // Decode through the keys of the *result* (not the input) so the test
+        // also verifies that the kernel carried the keys over correctly.
+        let actual: Vec<Option<i32>> = res_dict
+            .values()
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .unwrap()
+            .take_iter(res_dict.keys_iter())
+            .collect();
+
+        // Whole-vector comparison also catches a length mismatch.
+        assert_eq!(expected, actual);
+
+        Ok(())
+    }
}