You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ne...@apache.org on 2020/11/07 12:42:56 UTC
[arrow] branch master updated: ARROW-10449 [Rust] Make
Dictionary::keys be an array
This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2e284f4 ARROW-10449 [Rust] Make Dictionary::keys be an array
2e284f4 is described below
commit 2e284f48e8a57d832fd45f93b875d161709163a0
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Sat Nov 7 14:42:17 2020 +0200
ARROW-10449 [Rust] Make Dictionary::keys be an array
This PR:
1. Makes `DictionaryArray::keys` be an `PrimitiveArray`.
2. Removes `NullIter` and many of the `unsafe` code that it contained
3. Simplifies the parquet writer implementation around dictionaries
4. Indirectly removes a bug on NullIter's `DoubleEndedIterator` on which the implementation was not following the spec with respect to `end == current` (compare with implementation of `DoubleEndedIterator` for std's `Vec`'s `IntoIter`).
5. Fixes error in computing the size of a dictionary, which was double-counting `data` and `values` (`values` is a reference to `data`)
6. Adds check that the dictionary's ArrayData's datatype matches `T::DATA_TYPE`.
Since `NullIter` was not being directly tested, no tests were removed.
This is backward incompatible and the migration requires replacing `dict.keys()` by `dict.keys().into_iter()` whenever the user's intention is to use `keys()` as an iterator.
Closes #8561 from jorgecarleitao/dictionary_clean
Authored-by: Jorge C. Leitao <jo...@gmail.com>
Signed-off-by: Neville Dipale <ne...@gmail.com>
---
rust/arrow/src/array/array.rs | 215 +++++++------------------------
rust/arrow/src/array/builder.rs | 27 ++--
rust/arrow/src/array/equal.rs | 15 +--
rust/arrow/src/compute/kernels/filter.rs | 5 +-
rust/arrow/src/compute/kernels/take.rs | 7 +-
rust/arrow/src/json/reader.rs | 7 +-
rust/parquet/src/arrow/arrow_writer.rs | 99 +++++++-------
7 files changed, 111 insertions(+), 264 deletions(-)
diff --git a/rust/arrow/src/array/array.rs b/rust/arrow/src/array/array.rs
index e168f75..a6a976f 100644
--- a/rust/arrow/src/array/array.rs
+++ b/rust/arrow/src/array/array.rs
@@ -2129,28 +2129,32 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer, usize)> for StructArray {
/// Example **with nullable** data:
///
/// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
/// use arrow::datatypes::Int8Type;
/// let test = vec!["a", "a", "b", "c"];
/// let array : DictionaryArray<Int8Type> = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), None, Some(1)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![Some(0), Some(0), None, Some(1)]));
/// ```
///
/// Example **without nullable** data:
///
/// ```
-/// use arrow::array::DictionaryArray;
+/// use arrow::array::{DictionaryArray, Int8Array};
/// use arrow::datatypes::Int8Type;
/// let test = vec!["a", "a", "b", "c"];
/// let array : DictionaryArray<Int8Type> = test.into_iter().collect();
-/// assert_eq!(array.keys().collect::<Vec<Option<i8>>>(), vec![Some(0), Some(0), Some(1), Some(2)]);
+/// assert_eq!(array.keys(), &Int8Array::from(vec![0, 0, 1, 2]));
/// ```
pub struct DictionaryArray<K: ArrowPrimitiveType> {
- /// Array of keys, stored as a PrimitiveArray<K>.
+ /// Data of this dictionary. Note that this is _not_ compatible with the C Data interface,
+ /// as, in the current implementation, `values` below are the first child of this struct.
data: ArrayDataRef,
- /// Pointer to the key values.
- raw_values: RawPtrBox<K::Native>,
+ /// The keys of this dictionary. These are constructed from the buffer and null bitmap
+ /// of `data`.
+ /// Also, note that these do not correspond to the true values of this array. Rather, they map
+ /// to the real values.
+ keys: PrimitiveArray<K>,
/// Array of dictionary values (can by any DataType).
values: ArrayRef,
@@ -2159,112 +2163,10 @@ pub struct DictionaryArray<K: ArrowPrimitiveType> {
is_ordered: bool,
}
-#[derive(Debug)]
-enum Draining {
- Ready,
- Iterating,
- Finished,
-}
-
-#[derive(Debug)]
-pub struct NullableIter<'a, T> {
- data: &'a ArrayDataRef, // TODO: Use a pointer to the null bitmap.
- ptr: *const T,
- i: usize,
- len: usize,
- draining: Draining,
-}
-
-impl<'a, T> std::iter::Iterator for NullableIter<'a, T>
-where
- T: Clone,
-{
- type Item = Option<T>;
-
- fn next(&mut self) -> Option<Self::Item> {
- let i = self.i;
- if i >= self.len {
- None
- } else if self.data.is_null(i) {
- self.i += 1;
- Some(None)
- } else {
- self.i += 1;
- unsafe { Some(Some((&*self.ptr.add(i)).clone())) }
- }
- }
-
- fn size_hint(&self) -> (usize, Option<usize>) {
- (self.len, Some(self.len))
- }
-
- fn nth(&mut self, n: usize) -> Option<Self::Item> {
- let i = self.i;
- if i + n >= self.len {
- self.i = self.len;
- None
- } else if self.data.is_null(i + n) {
- self.i += n + 1;
- Some(None)
- } else {
- self.i += n + 1;
- unsafe { Some(Some((&*self.ptr.add(i + n)).clone())) }
- }
- }
-}
-
-impl<'a, T> std::iter::DoubleEndedIterator for NullableIter<'a, T>
-where
- T: Clone,
-{
- fn next_back(&mut self) -> Option<Self::Item> {
- match self.draining {
- Draining::Ready => {
- self.draining = Draining::Iterating;
- self.i = self.len - 1;
- self.next_back()
- }
- Draining::Iterating => {
- let i = self.i;
- if i >= self.len {
- None
- } else if self.data.is_null(i) {
- self.i = self.i.checked_sub(1).unwrap_or_else(|| {
- self.draining = Draining::Finished;
- 0_usize
- });
- Some(None)
- } else {
- match i.checked_sub(1) {
- Some(idx) => {
- self.i = idx;
- unsafe { Some(Some((&*self.ptr.add(i)).clone())) }
- }
- _ => {
- self.draining = Draining::Finished;
- unsafe { Some(Some((&*self.ptr).clone())) }
- }
- }
- }
- }
- Draining::Finished => {
- self.draining = Draining::Ready;
- None
- }
- }
- }
-}
-
impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
/// Return an iterator to the keys of this dictionary.
- pub fn keys(&self) -> NullableIter<'_, K::Native> {
- NullableIter::<'_, K::Native> {
- data: &self.data,
- ptr: unsafe { self.raw_values.get().add(self.data.offset()) },
- i: 0,
- len: self.data.len(),
- draining: Draining::Ready,
- }
+ pub fn keys(&self) -> &PrimitiveArray<K> {
+ &self.keys
}
/// Returns an array view of the keys of this dictionary
@@ -2305,12 +2207,12 @@ impl<'a, K: ArrowPrimitiveType> DictionaryArray<K> {
/// The length of the dictionary is the length of the keys array.
pub fn len(&self) -> usize {
- self.data.len()
+ self.keys.len()
}
/// Whether this dictionary is empty
pub fn is_empty(&self) -> bool {
- self.data.is_empty()
+ self.keys.is_empty()
}
// Currently exists for compatibility purposes with Arrow IPC.
@@ -2333,13 +2235,24 @@ impl<T: ArrowPrimitiveType> From<ArrayDataRef> for DictionaryArray<T> {
"DictionaryArray should contain a single child array (values)."
);
- let raw_values = data.buffers()[0].raw_data();
- let dtype: &DataType = data.data_type();
- let values = make_array(data.child_data()[0].clone());
- if let DataType::Dictionary(_, _) = dtype {
+ if let DataType::Dictionary(key_data_type, _) = data.data_type() {
+ if key_data_type.as_ref() != &T::DATA_TYPE {
+ panic!("DictionaryArray's data type must match.")
+ };
+ // create a zero-copy of the keys' data
+ let keys = PrimitiveArray::<T>::from(Arc::new(ArrayData::new(
+ T::DATA_TYPE,
+ data.len(),
+ Some(data.null_count()),
+ data.null_buffer().cloned(),
+ data.offset(),
+ data.buffers().to_vec(),
+ vec![],
+ )));
+ let values = make_array(data.child_data()[0].clone());
Self {
data,
- raw_values: RawPtrBox::new(raw_values as *const T::Native),
+ keys,
values,
is_ordered: false,
}
@@ -2410,32 +2323,25 @@ impl<T: ArrowPrimitiveType> Array for DictionaryArray<T> {
&self.data
}
- /// Returns the total number of bytes of memory occupied by the buffers owned by this [DictionaryArray].
fn get_buffer_memory_size(&self) -> usize {
- self.data.get_buffer_memory_size() + self.values().get_buffer_memory_size()
+ // Since both `keys` and `values` derive (are references from) `data`, we only need to account for `data`.
+ self.data.get_buffer_memory_size()
}
- /// Returns the total number of bytes of memory occupied physically by this [DictionaryArray].
fn get_array_memory_size(&self) -> usize {
self.data.get_array_memory_size()
- + self.values().get_array_memory_size()
+ + self.keys.get_array_memory_size()
+ + self.values.get_array_memory_size()
+ mem::size_of_val(self)
}
}
impl<T: ArrowPrimitiveType> fmt::Debug for DictionaryArray<T> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
- const MAX_LEN: usize = 10;
- let keys: Vec<_> = self.keys().take(MAX_LEN).collect();
- let elipsis = if self.keys().count() > MAX_LEN {
- "..."
- } else {
- ""
- };
writeln!(
f,
- "DictionaryArray {{keys: {:?}{} values: {:?}}}",
- keys, elipsis, self.values
+ "DictionaryArray {{keys: {:?} values: {:?}}}",
+ self.keys, self.values
)
}
}
@@ -3118,23 +3024,7 @@ mod tests {
// Null count only makes sense in terms of the component arrays.
assert_eq!(0, dict_array.null_count());
assert_eq!(0, dict_array.values().null_count());
- assert_eq!(Some(Some(3)), dict_array.keys().nth(1));
- assert_eq!(Some(Some(4)), dict_array.keys().nth(2));
-
- assert_eq!(
- dict_array.keys().collect::<Vec<Option<i16>>>(),
- vec![Some(2), Some(3), Some(4)]
- );
-
- assert_eq!(
- dict_array.keys().rev().collect::<Vec<Option<i16>>>(),
- vec![Some(4), Some(3), Some(2)]
- );
-
- assert_eq!(
- dict_array.keys().rev().rev().collect::<Vec<Option<i16>>>(),
- vec![Some(2), Some(3), Some(4)]
- );
+ assert_eq!(dict_array.keys(), &Int16Array::from(vec![2_i16, 3, 4]));
// Now test with a non-zero offset
let dict_data = ArrayData::builder(dict_data_type)
@@ -3149,26 +3039,7 @@ mod tests {
assert_eq!(value_data, values.data());
assert_eq!(DataType::Int8, dict_array.value_type());
assert_eq!(2, dict_array.len());
- assert_eq!(Some(Some(3)), dict_array.keys().nth(0));
- assert_eq!(Some(Some(4)), dict_array.keys().nth(1));
-
- assert_eq!(
- dict_array.keys().collect::<Vec<Option<i16>>>(),
- vec![Some(3), Some(4)]
- );
- }
-
- #[test]
- fn test_dictionary_array_key_reverse() {
- let test = vec!["a", "a", "b", "c"];
- let array: DictionaryArray<Int8Type> = test
- .iter()
- .map(|&x| if x == "b" { None } else { Some(x) })
- .collect();
- assert_eq!(
- array.keys().rev().collect::<Vec<Option<i8>>>(),
- vec![Some(1), None, Some(0), Some(0)]
- );
+ assert_eq!(dict_array.keys(), &Int16Array::from(vec![3_i16, 4]));
}
#[test]
@@ -4307,7 +4178,7 @@ mod tests {
builder.append(22345678).unwrap();
let array = builder.finish();
assert_eq!(
- "DictionaryArray {keys: [Some(0), None, Some(1)] values: PrimitiveArray<UInt32>\n[\n 12345678,\n 22345678,\n]}\n",
+ "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n 0,\n null,\n 1,\n] values: PrimitiveArray<UInt32>\n[\n 12345678,\n 22345678,\n]}\n",
format!("{:?}", array)
);
@@ -4319,7 +4190,7 @@ mod tests {
}
let array = builder.finish();
assert_eq!(
- "DictionaryArray {keys: [Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0), Some(0)]... values: PrimitiveArray<UInt32>\n[\n 1,\n]}\n",
+ "DictionaryArray {keys: PrimitiveArray<UInt8>\n[\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n 0,\n] values: PrimitiveArray<UInt32>\n[\n 1,\n]}\n",
format!("{:?}", array)
);
}
@@ -4332,13 +4203,13 @@ mod tests {
.map(|&x| if x == "b" { None } else { Some(x) })
.collect();
assert_eq!(
- "DictionaryArray {keys: [Some(0), Some(0), None, Some(1)] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n",
+ "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n null,\n 1,\n] values: StringArray\n[\n \"a\",\n \"c\",\n]}\n",
format!("{:?}", array)
);
let array: DictionaryArray<Int8Type> = test.into_iter().collect();
assert_eq!(
- "DictionaryArray {keys: [Some(0), Some(0), Some(1), Some(2)] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n",
+ "DictionaryArray {keys: PrimitiveArray<Int8>\n[\n 0,\n 0,\n 1,\n 2,\n] values: StringArray\n[\n \"a\",\n \"b\",\n \"c\",\n]}\n",
format!("{:?}", array)
);
}
diff --git a/rust/arrow/src/array/builder.rs b/rust/arrow/src/array/builder.rs
index 09fb9e7..21ddb7d 100644
--- a/rust/arrow/src/array/builder.rs
+++ b/rust/arrow/src/array/builder.rs
@@ -2326,7 +2326,7 @@ where
///
/// ```
/// use arrow::datatypes::Int16Type;
- /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder};
+ /// use arrow::array::{StringArray, StringDictionaryBuilder, PrimitiveBuilder, Int16Array};
/// use std::convert::TryFrom;
///
/// let dictionary_values = StringArray::from(vec![None, Some("abc"), Some("def")]);
@@ -2338,9 +2338,9 @@ where
///
/// let dictionary_array = builder.finish();
///
- /// let keys: Vec<Option<i16>> = dictionary_array.keys().collect();
+ /// let keys = dictionary_array.keys();
///
- /// assert_eq!(keys, vec![Some(2), None, Some(1)]);
+ /// assert_eq!(keys, &Int16Array::from(vec![Some(2), None, Some(1)]));
/// ```
pub fn new_with_dictionary(
keys_builder: PrimitiveBuilder<K>,
@@ -3430,8 +3430,10 @@ mod tests {
builder.append(22345678).unwrap();
let array = builder.finish();
- // Keys are strongly typed.
- let aks: Vec<_> = array.keys().collect();
+ assert_eq!(
+ array.keys(),
+ &UInt8Array::from(vec![Some(0), None, Some(1)])
+ );
// Values are polymorphic and so require a downcast.
let av = array.values();
@@ -3442,7 +3444,6 @@ mod tests {
assert_eq!(array.is_null(1), true);
assert_eq!(array.is_null(2), false);
- assert_eq!(aks, vec![Some(0), None, Some(1)]);
assert_eq!(avs, &[12345678, 22345678]);
}
@@ -3458,14 +3459,15 @@ mod tests {
builder.append("abc").unwrap();
let array = builder.finish();
- // Keys are strongly typed.
- let aks: Vec<_> = array.keys().collect();
+ assert_eq!(
+ array.keys(),
+ &Int8Array::from(vec![Some(0), None, Some(1), Some(1), Some(0)])
+ );
// Values are polymorphic and so require a downcast.
let av = array.values();
let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
- assert_eq!(aks, vec![Some(0), None, Some(1), Some(1), Some(0)]);
assert_eq!(ava.value(0), "abc");
assert_eq!(ava.value(1), "def");
}
@@ -3486,14 +3488,15 @@ mod tests {
builder.append("ghi").unwrap();
let array = builder.finish();
- // Keys are strongly typed.
- let aks: Vec<_> = array.keys().collect();
+ assert_eq!(
+ array.keys(),
+ &Int8Array::from(vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)])
+ );
// Values are polymorphic and so require a downcast.
let av = array.values();
let ava: &StringArray = av.as_any().downcast_ref::<StringArray>().unwrap();
- assert_eq!(aks, vec![Some(2), None, Some(1), Some(1), Some(2), Some(3)]);
assert_eq!(ava.is_valid(0), false);
assert_eq!(ava.value(1), "def");
assert_eq!(ava.value(2), "abc");
diff --git a/rust/arrow/src/array/equal.rs b/rust/arrow/src/array/equal.rs
index 26b4cd4..9a0b7e6 100644
--- a/rust/arrow/src/array/equal.rs
+++ b/rust/arrow/src/array/equal.rs
@@ -250,11 +250,9 @@ impl<T: ArrowPrimitiveType> ArrayEqual for DictionaryArray<T> {
assert!(other_start_idx + (end_idx - start_idx) <= other.len());
let other = other.as_any().downcast_ref::<DictionaryArray<T>>().unwrap();
- let iter_a = self.keys().take(end_idx).skip(start_idx);
- let iter_b = other.keys().skip(other_start_idx);
-
// For now, all the values must be the same
- iter_a.eq(iter_b)
+ self.keys()
+ .range_equals(other.keys(), start_idx, end_idx, other_start_idx)
&& self
.values()
.range_equals(&*other.values(), 0, other.values().len(), 0)
@@ -910,13 +908,8 @@ impl<OffsetSize: OffsetSizeTrait> PartialEq<GenericListArray<OffsetSize>> for Va
impl<T: ArrowPrimitiveType> JsonEqual for DictionaryArray<T> {
fn equals_json(&self, json: &[&Value]) -> bool {
- self.keys().zip(json.iter()).all(|aj| match aj {
- (None, Value::Null) => true,
- (Some(a), Value::Number(j)) => {
- a.to_usize().unwrap() as u64 == j.as_u64().unwrap()
- }
- _ => false,
- })
+ // todo: this is wrong: we must test the values also
+ self.keys().equals_json(json)
}
}
diff --git a/rust/arrow/src/compute/kernels/filter.rs b/rust/arrow/src/compute/kernels/filter.rs
index 05d5348..0d19443 100644
--- a/rust/arrow/src/compute/kernels/filter.rs
+++ b/rust/arrow/src/compute/kernels/filter.rs
@@ -1052,10 +1052,7 @@ mod tests {
// but keys are filtered
assert_eq!(2, d.len());
assert_eq!(true, d.is_null(0));
- assert_eq!(
- "world",
- values.value(d.keys().nth(1).unwrap().unwrap() as usize)
- );
+ assert_eq!("world", values.value(d.keys().value(1) as usize));
}
#[test]
diff --git a/rust/arrow/src/compute/kernels/take.rs b/rust/arrow/src/compute/kernels/take.rs
index 83139c3..08016b9 100644
--- a/rust/arrow/src/compute/kernels/take.rs
+++ b/rust/arrow/src/compute/kernels/take.rs
@@ -1157,8 +1157,6 @@ mod tests {
assert_eq!(&expected_values, dict_values);
assert_eq!(&expected_values, &result_values);
- let result_keys: Int16Array = result.keys().collect::<Vec<_>>().into();
-
let expected_keys = Int16Array::from(vec![
Some(0),
Some(0),
@@ -1168,9 +1166,6 @@ mod tests {
Some(2),
None,
]);
-
- assert_eq!(expected_keys.len(), result_keys.len());
- assert_eq!(expected_keys.data_type(), result_keys.data_type());
- assert_eq!(expected_keys, result_keys);
+ assert_eq!(result.keys(), &expected_keys);
}
}
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index 8959458..7a414db 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -1603,10 +1603,9 @@ mod tests {
assert_eq!(true, dd.is_valid(2));
assert_eq!(false, dd.is_valid(11));
- let keys: Vec<_> = dd.keys().collect();
assert_eq!(
- keys,
- vec![
+ dd.keys(),
+ &Int16Array::from(vec![
None,
Some(0),
Some(1),
@@ -1619,7 +1618,7 @@ mod tests {
Some(0),
Some(0),
None
- ]
+ ])
);
}
diff --git a/rust/parquet/src/arrow/arrow_writer.rs b/rust/parquet/src/arrow/arrow_writer.rs
index f2279fb..c0b2105 100644
--- a/rust/parquet/src/arrow/arrow_writer.rs
+++ b/rust/parquet/src/arrow/arrow_writer.rs
@@ -177,11 +177,7 @@ fn write_leaves(
Ok(())
}
ArrowDataType::Dictionary(key_type, value_type) => {
- use arrow_array::{
- Int16DictionaryArray, Int32DictionaryArray, Int64DictionaryArray,
- Int8DictionaryArray, PrimitiveArray, StringArray, UInt16DictionaryArray,
- UInt32DictionaryArray, UInt64DictionaryArray, UInt8DictionaryArray,
- };
+ use arrow_array::{PrimitiveArray, StringArray};
use ArrowDataType::*;
use ColumnWriter::*;
@@ -217,9 +213,10 @@ fn write_leaves(
.unwrap();
use std::convert::TryFrom;
- // This removes NULL values from the NullableIter, but
+ // This removes NULL values from the keys, but
// they're encoded by the levels, so that's fine.
let materialized_values: Vec<_> = keys
+ .into_iter()
.flatten()
.map(|key| {
usize::try_from(key).unwrap_or_else(|k| {
@@ -250,14 +247,14 @@ fn write_leaves(
}
dispatch_dictionary!(
- Int8, Utf8, ByteArrayColumnWriter => Int8DictionaryArray, StringArray,
- Int16, Utf8, ByteArrayColumnWriter => Int16DictionaryArray, StringArray,
- Int32, Utf8, ByteArrayColumnWriter => Int32DictionaryArray, StringArray,
- Int64, Utf8, ByteArrayColumnWriter => Int64DictionaryArray, StringArray,
- UInt8, Utf8, ByteArrayColumnWriter => UInt8DictionaryArray, StringArray,
- UInt16, Utf8, ByteArrayColumnWriter => UInt16DictionaryArray, StringArray,
- UInt32, Utf8, ByteArrayColumnWriter => UInt32DictionaryArray, StringArray,
- UInt64, Utf8, ByteArrayColumnWriter => UInt64DictionaryArray, StringArray,
+ Int8, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int8Type, StringArray,
+ Int16, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int16Type, StringArray,
+ Int32, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int32Type, StringArray,
+ Int64, Utf8, ByteArrayColumnWriter => arrow::datatypes::Int64Type, StringArray,
+ UInt8, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt8Type, StringArray,
+ UInt16, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt16Type, StringArray,
+ UInt32, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt32Type, StringArray,
+ UInt64, Utf8, ByteArrayColumnWriter => arrow::datatypes::UInt64Type, StringArray,
)?;
row_group_writer.close_column(col_writer)?;
@@ -283,48 +280,40 @@ trait Materialize<K, V> {
fn materialize(&self) -> Vec<Self::Output>;
}
-macro_rules! materialize_string {
- ($($k:ty,)*) => {
- $(impl Materialize<$k, arrow_array::StringArray> for dyn Array {
- type Output = ByteArray;
-
- fn materialize(&self) -> Vec<Self::Output> {
- use std::convert::TryFrom;
-
- let typed_array = self.as_any()
- .downcast_ref::<$k>()
- .expect("Unable to get dictionary array");
-
- let keys = typed_array.keys();
-
- let value_buffer = typed_array.values();
- let values = value_buffer
- .as_any()
- .downcast_ref::<arrow_array::StringArray>()
- .unwrap();
-
- // This removes NULL values from the NullableIter, but
- // they're encoded by the levels, so that's fine.
- keys
- .flatten()
- .map(|key| usize::try_from(key).unwrap_or_else(|k| panic!("key {} does not fit in usize", k)))
- .map(|key| values.value(key))
- .map(ByteArray::from)
- .collect()
- }
- })*
- };
-}
+impl<K> Materialize<K, arrow_array::StringArray> for dyn Array
+where
+ K: arrow::datatypes::ArrowDictionaryKeyType,
+{
+ type Output = ByteArray;
+
+ fn materialize(&self) -> Vec<Self::Output> {
+ use arrow::datatypes::ArrowNativeType;
-materialize_string! {
- arrow_array::Int8DictionaryArray,
- arrow_array::Int16DictionaryArray,
- arrow_array::Int32DictionaryArray,
- arrow_array::Int64DictionaryArray,
- arrow_array::UInt8DictionaryArray,
- arrow_array::UInt16DictionaryArray,
- arrow_array::UInt32DictionaryArray,
- arrow_array::UInt64DictionaryArray,
+ let typed_array = self
+ .as_any()
+ .downcast_ref::<arrow_array::DictionaryArray<K>>()
+ .expect("Unable to get dictionary array");
+
+ let keys = typed_array.keys();
+
+ let value_buffer = typed_array.values();
+ let values = value_buffer
+ .as_any()
+ .downcast_ref::<arrow_array::StringArray>()
+ .unwrap();
+
+ // This removes NULL values from the keys, but
+ // they're encoded by the levels, so that's fine.
+ keys.into_iter()
+ .flatten()
+ .map(|key| {
+ key.to_usize()
+ .unwrap_or_else(|| panic!("key {:?} does not fit in usize", key))
+ })
+ .map(|key| values.value(key))
+ .map(ByteArray::from)
+ .collect()
+ }
}
fn write_dict<K, V, T>(