You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/02/15 20:12:27 UTC
[arrow-rs] branch master updated: Add `DictionaryArray::try_new()` to create dictionaries from pre existing arrays (#1300)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 7d46ac1 Add `DictionaryArray::try_new()` to create dictionaries from pre existing arrays (#1300)
7d46ac1 is described below
commit 7d46ac1ffeafdbbdb708ddccc4dbda8988f11a77
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Tue Feb 15 15:12:21 2022 -0500
Add `DictionaryArray::try_new()` to create dictionaries from pre existing arrays (#1300)
* Add DictionaryArray::try_new()
* Update arrow/src/array/array_dictionary.rs
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
Co-authored-by: Liang-Chi Hsieh <vi...@gmail.com>
---
arrow/src/array/array_dictionary.rs | 91 ++++++++++++++++++--
arrow/src/array/equal/mod.rs | 8 +-
arrow/src/compute/kernels/comparison.rs | 148 ++++++++++----------------------
3 files changed, 134 insertions(+), 113 deletions(-)
diff --git a/arrow/src/array/array_dictionary.rs b/arrow/src/array/array_dictionary.rs
index 75d4f3a..5d850c9 100644
--- a/arrow/src/array/array_dictionary.rs
+++ b/arrow/src/array/array_dictionary.rs
@@ -26,6 +26,7 @@ use super::{
};
use crate::datatypes::ArrowNativeType;
use crate::datatypes::{ArrowDictionaryKeyType, ArrowPrimitiveType, DataType};
+use crate::error::Result;
/// A dictionary array where each element is a single value indexed by an integer key.
/// This is mostly used to represent strings or a limited set of primitive types as integers,
@@ -50,15 +51,31 @@ use crate::datatypes::{ArrowDictionaryKeyType, ArrowPrimitiveType, DataType};
/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
/// ```
+///
+/// Example from existing arrays:
+///
+/// ```
+/// use arrow::array::{DictionaryArray, Int8Array, StringArray};
+/// use arrow::datatypes::Int8Type;
+/// // You can form your own DictionaryArray by providing the
+/// // values (dictionary) and keys (indexes into the dictionary):
+/// let values = StringArray::from_iter_values(["a", "b", "c"]);
+/// let keys = Int8Array::from_iter_values([0, 0, 1, 2]);
+/// let array = DictionaryArray::<Int8Type>::try_new(&keys, &values).unwrap();
+/// let expected: DictionaryArray::<Int8Type> = vec!["a", "a", "b", "c"]
+/// .into_iter()
+/// .collect();
+/// assert_eq!(&array, &expected);
+/// ```
pub struct DictionaryArray<K: ArrowPrimitiveType> {
/// Data of this dictionary. Note that this is _not_ compatible with the C Data interface,
/// as, in the current implementation, `values` below are the first child of this struct.
data: ArrayData,
- /// The keys of this dictionary. These are constructed from the buffer and null bitmap
- /// of `data`.
- /// Also, note that these do not correspond to the true values of this array. Rather, they map
- /// to the real values.
+ /// The keys of this dictionary. These are constructed from the
+ /// buffer and null bitmap of `data`. Also, note that these do
+ /// not correspond to the true values of this array. Rather, they
+ /// map to the real values.
keys: PrimitiveArray<K>,
/// Array of dictionary values (can be any DataType).
@@ -69,6 +86,27 @@ pub struct DictionaryArray<K: ArrowPrimitiveType> {
}
impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
+ /// Attempt to create a new DictionaryArray from the specified keys
+ /// (indexes into the dictionary) and values (the dictionary)
+ /// arrays. Returns an error if any key falls outside the bounds
+ /// of the values array.
+ pub fn try_new(keys: &PrimitiveArray<K>, values: &dyn Array) -> Result<Self> {
+ let dict_data_type = DataType::Dictionary(
+ Box::new(keys.data_type().clone()),
+ Box::new(values.data_type().clone()),
+ );
+
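+ // Note: This does more work than necessary by rebuilding /
+ // revalidating all the data.
+ // NOTE(review): only the keys' first buffer is handed to the builder;
+ // no validity bitmap or offset is set, and `test_try_new` expects a
+ // `None` key to print as `0` -- confirm whether null keys should be
+ // preserved here.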
+ let data = ArrayData::builder(dict_data_type)
+ .len(keys.len())
+ .add_buffer(keys.data().buffers()[0].clone())
+ .add_child_data(values.data().clone())
+ .build()?;
+
+ Ok(data.into())
+ }
+
/// Return an array view of the keys of this dictionary as a PrimitiveArray.
pub fn keys(&self) -> &PrimitiveArray<K> {
&self.keys
@@ -257,13 +295,13 @@ mod tests {
use super::*;
use crate::{
- array::Int16Array,
- datatypes::{Int32Type, Int8Type, UInt32Type, UInt8Type},
- };
- use crate::{
array::Int16DictionaryArray, array::PrimitiveDictionaryBuilder,
datatypes::DataType,
};
+ use crate::{
+ array::{Int16Array, Int32Array},
+ datatypes::{Int32Type, Int8Type, UInt32Type, UInt8Type},
+ };
use crate::{buffer::Buffer, datatypes::ToByteSlice};
#[test]
@@ -422,4 +460,41 @@ mod tests {
.validate_full()
.expect("All null array has valid array data");
}
+
+ #[test]
+ fn test_try_new() {
+ let values: StringArray = [Some("foo"), Some("bar"), Some("baz")]
+ .into_iter()
+ .collect();
+ let keys: Int32Array = [Some(0), Some(2), None, Some(1)].into_iter().collect();
+
+ let array = DictionaryArray::<Int32Type>::try_new(&keys, &values).unwrap();
+ assert_eq!(array.keys().data_type(), &DataType::Int32);
+ assert_eq!(array.values().data_type(), &DataType::Utf8);
+ assert_eq!(
+ "DictionaryArray {keys: PrimitiveArray<Int32>\n[\n 0,\n 2,\n 0,\n 1,\n] values: StringArray\n[\n \"foo\",\n \"bar\",\n \"baz\",\n]}\n",
+ format!("{:?}", array)
+ );
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Value at position 1 out of bounds: 3 (should be in [0, 1])"
+ )]
+ fn test_try_new_index_too_large() {
+ let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
+ // dictionary only has 2 values, so key 3 is out of bounds
+ let keys: Int32Array = [Some(0), Some(3)].into_iter().collect();
+ DictionaryArray::<Int32Type>::try_new(&keys, &values).unwrap();
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "Value at position 0 out of bounds: -100 (should be in [0, 1])"
+ )]
+ fn test_try_new_index_too_small() {
+ let values: StringArray = [Some("foo"), Some("bar")].into_iter().collect();
+ let keys: Int32Array = [Some(-100)].into_iter().collect();
+ DictionaryArray::<Int32Type>::try_new(&keys, &values).unwrap();
+ }
}
diff --git a/arrow/src/array/equal/mod.rs b/arrow/src/array/equal/mod.rs
index 9743aa5..18d0ffe 100644
--- a/arrow/src/array/equal/mod.rs
+++ b/arrow/src/array/equal/mod.rs
@@ -20,7 +20,7 @@
//! depend on dynamic casting of `Array`.
use super::{
- Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray, DecimalArray,
+ Array, ArrayData, BinaryOffsetSizeTrait, BooleanArray, DecimalArray, DictionaryArray,
FixedSizeBinaryArray, FixedSizeListArray, GenericBinaryArray, GenericListArray,
GenericStringArray, MapArray, NullArray, OffsetSizeTrait, PrimitiveArray,
StringOffsetSizeTrait, StructArray,
@@ -81,6 +81,12 @@ impl<T: ArrowPrimitiveType> PartialEq for PrimitiveArray<T> {
}
}
+impl<K: ArrowPrimitiveType> PartialEq for DictionaryArray<K> {
+ fn eq(&self, other: &Self) -> bool {
+ equal(self.data(), other.data())
+ }
+}
+
impl PartialEq for BooleanArray {
fn eq(&self, other: &BooleanArray) -> bool {
equal(self.data(), other.data())
diff --git a/arrow/src/compute/kernels/comparison.rs b/arrow/src/compute/kernels/comparison.rs
index 7b11b17..34a90f1 100644
--- a/arrow/src/compute/kernels/comparison.rs
+++ b/arrow/src/compute/kernels/comparison.rs
@@ -2699,7 +2699,6 @@ mod tests {
use super::*;
use crate::datatypes::Int8Type;
- use crate::datatypes::ToByteSlice;
use crate::{array::Int32Array, array::Int64Array, datatypes::Field};
/// Evaluate `KERNEL` with two vectors as inputs and assert against the expected output.
@@ -4664,41 +4663,15 @@ mod tests {
);
}
- fn get_dict_arraydata(
- keys: Buffer,
- key_type: DataType,
- value_data: ArrayData,
- ) -> ArrayData {
- let value_type = value_data.data_type().clone();
- let dict_data_type =
- DataType::Dictionary(Box::new(key_type), Box::new(value_type));
- ArrayData::builder(dict_data_type)
- .len(3)
- .add_buffer(keys)
- .add_child_data(value_data)
- .build()
- .unwrap()
- }
-
#[test]
fn test_eq_dyn_dictionary_i8_array() {
- let key_type = DataType::Int8;
// Construct a value array
- let value_data = ArrayData::builder(DataType::Int8)
- .len(8)
- .add_buffer(Buffer::from(
- &[10_i8, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(),
- ))
- .build()
- .unwrap();
+ let values = Int8Array::from_iter_values([10_i8, 11, 12, 13, 14, 15, 16, 17]);
- let keys1 = Buffer::from(&[2_i8, 3, 4].to_byte_slice());
- let keys2 = Buffer::from(&[2_i8, 4, 4].to_byte_slice());
- let dict_array1: DictionaryArray<Int8Type> = Int8DictionaryArray::from(
- get_dict_arraydata(keys1, key_type.clone(), value_data.clone()),
- );
- let dict_array2: DictionaryArray<Int8Type> =
- Int8DictionaryArray::from(get_dict_arraydata(keys2, key_type, value_data));
+ let keys1 = Int8Array::from_iter_values([2_i8, 3, 4]);
+ let keys2 = Int8Array::from_iter_values([2_i8, 4, 4]);
+ let dict_array1 = DictionaryArray::try_new(&keys1, &values).unwrap();
+ let dict_array2 = DictionaryArray::try_new(&keys2, &values).unwrap();
let result = eq_dyn(&dict_array1, &dict_array2);
assert!(result.is_ok());
@@ -4707,23 +4680,14 @@ mod tests {
#[test]
fn test_eq_dyn_dictionary_u64_array() {
- let key_type = DataType::UInt64;
- // Construct a value array
- let value_data = ArrayData::builder(DataType::UInt64)
- .len(8)
- .add_buffer(Buffer::from(
- &[10_u64, 11, 12, 13, 14, 15, 16, 17].to_byte_slice(),
- ))
- .build()
- .unwrap();
+ let values = UInt64Array::from_iter_values([10_u64, 11, 12, 13, 14, 15, 16, 17]);
- let keys1 = Buffer::from(&[1_u64, 3, 4].to_byte_slice());
- let keys2 = Buffer::from(&[2_u64, 3, 5].to_byte_slice());
- let dict_array1: DictionaryArray<UInt64Type> = UInt64DictionaryArray::from(
- get_dict_arraydata(keys1, key_type.clone(), value_data.clone()),
- );
- let dict_array2: DictionaryArray<UInt64Type> =
- UInt64DictionaryArray::from(get_dict_arraydata(keys2, key_type, value_data));
+ let keys1 = UInt64Array::from_iter_values([1_u64, 3, 4]);
+ let keys2 = UInt64Array::from_iter_values([2_u64, 3, 5]);
+ let dict_array1 =
+ DictionaryArray::<UInt64Type>::try_new(&keys1, &values).unwrap();
+ let dict_array2 =
+ DictionaryArray::<UInt64Type>::try_new(&keys2, &values).unwrap();
let result = eq_dyn(&dict_array1, &dict_array2);
assert!(result.is_ok());
@@ -4757,29 +4721,17 @@ mod tests {
#[test]
fn test_eq_dyn_dictionary_binary_array() {
- let key_type = DataType::UInt64;
-
- // Construct a value array
- let values: [u8; 12] = [
- b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
- ];
- let offsets: [i32; 4] = [0, 5, 5, 12];
-
- // Array data: ["hello", "", "parquet"]
- let value_data = ArrayData::builder(DataType::Binary)
- .len(3)
- .add_buffer(Buffer::from_slice_ref(&offsets))
- .add_buffer(Buffer::from_slice_ref(&values))
- .build()
- .unwrap();
+ let values: BinaryArray = ["hello", "", "parquet"]
+ .into_iter()
+ .map(|b| Some(b.as_bytes()))
+ .collect();
- let keys1 = Buffer::from(&[0_u64, 1, 2].to_byte_slice());
- let keys2 = Buffer::from(&[0_u64, 2, 1].to_byte_slice());
- let dict_array1: DictionaryArray<UInt64Type> = UInt64DictionaryArray::from(
- get_dict_arraydata(keys1, key_type.clone(), value_data.clone()),
- );
- let dict_array2: DictionaryArray<UInt64Type> =
- UInt64DictionaryArray::from(get_dict_arraydata(keys2, key_type, value_data));
+ let keys1 = UInt64Array::from_iter_values([0_u64, 1, 2]);
+ let keys2 = UInt64Array::from_iter_values([0_u64, 2, 1]);
+ let dict_array1 =
+ DictionaryArray::<UInt64Type>::try_new(&keys1, &values).unwrap();
+ let dict_array2 =
+ DictionaryArray::<UInt64Type>::try_new(&keys2, &values).unwrap();
let result = eq_dyn(&dict_array1, &dict_array2);
assert!(result.is_ok());
@@ -4791,18 +4743,14 @@ mod tests {
#[test]
fn test_eq_dyn_dictionary_interval_array() {
- let key_type = DataType::UInt64;
+ let values = IntervalDayTimeArray::from(vec![1, 6, 10, 2, 3, 5]);
- let value_array = IntervalDayTimeArray::from(vec![1, 6, 10, 2, 3, 5]);
- let value_data = value_array.data().clone();
-
- let keys1 = Buffer::from(&[1_u64, 0, 3].to_byte_slice());
- let keys2 = Buffer::from(&[2_u64, 0, 3].to_byte_slice());
- let dict_array1: DictionaryArray<UInt64Type> = UInt64DictionaryArray::from(
- get_dict_arraydata(keys1, key_type.clone(), value_data.clone()),
- );
- let dict_array2: DictionaryArray<UInt64Type> =
- UInt64DictionaryArray::from(get_dict_arraydata(keys2, key_type, value_data));
+ let keys1 = UInt64Array::from_iter_values([1_u64, 0, 3]);
+ let keys2 = UInt64Array::from_iter_values([2_u64, 0, 3]);
+ let dict_array1 =
+ DictionaryArray::<UInt64Type>::try_new(&keys1, &values).unwrap();
+ let dict_array2 =
+ DictionaryArray::<UInt64Type>::try_new(&keys2, &values).unwrap();
let result = eq_dyn(&dict_array1, &dict_array2);
assert!(result.is_ok());
@@ -4811,18 +4759,14 @@ mod tests {
#[test]
fn test_eq_dyn_dictionary_date_array() {
- let key_type = DataType::UInt64;
-
- let value_array = Date32Array::from(vec![1, 6, 10, 2, 3, 5]);
- let value_data = value_array.data().clone();
+ let values = Date32Array::from(vec![1, 6, 10, 2, 3, 5]);
- let keys1 = Buffer::from(&[1_u64, 0, 3].to_byte_slice());
- let keys2 = Buffer::from(&[2_u64, 0, 3].to_byte_slice());
- let dict_array1: DictionaryArray<UInt64Type> = UInt64DictionaryArray::from(
- get_dict_arraydata(keys1, key_type.clone(), value_data.clone()),
- );
- let dict_array2: DictionaryArray<UInt64Type> =
- UInt64DictionaryArray::from(get_dict_arraydata(keys2, key_type, value_data));
+ let keys1 = UInt64Array::from_iter_values([1_u64, 0, 3]);
+ let keys2 = UInt64Array::from_iter_values([2_u64, 0, 3]);
+ let dict_array1 =
+ DictionaryArray::<UInt64Type>::try_new(&keys1, &values).unwrap();
+ let dict_array2 =
+ DictionaryArray::<UInt64Type>::try_new(&keys2, &values).unwrap();
let result = eq_dyn(&dict_array1, &dict_array2);
assert!(result.is_ok());
@@ -4831,18 +4775,14 @@ mod tests {
#[test]
fn test_eq_dyn_dictionary_bool_array() {
- let key_type = DataType::UInt64;
-
- let value_array = BooleanArray::from(vec![true, false]);
- let value_data = value_array.data().clone();
-
- let keys1 = Buffer::from(&[1_u64, 1, 1].to_byte_slice());
- let keys2 = Buffer::from(&[0_u64, 1, 0].to_byte_slice());
- let dict_array1: DictionaryArray<UInt64Type> = UInt64DictionaryArray::from(
- get_dict_arraydata(keys1, key_type.clone(), value_data.clone()),
- );
- let dict_array2: DictionaryArray<UInt64Type> =
- UInt64DictionaryArray::from(get_dict_arraydata(keys2, key_type, value_data));
+ let values = BooleanArray::from(vec![true, false]);
+
+ let keys1 = UInt64Array::from_iter_values([1_u64, 1, 1]);
+ let keys2 = UInt64Array::from_iter_values([0_u64, 1, 0]);
+ let dict_array1 =
+ DictionaryArray::<UInt64Type>::try_new(&keys1, &values).unwrap();
+ let dict_array2 =
+ DictionaryArray::<UInt64Type>::try_new(&keys2, &values).unwrap();
let result = eq_dyn(&dict_array1, &dict_array2);
assert!(result.is_ok());