You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/05/06 18:45:06 UTC
[arrow-rs] branch master updated: Remove parquet dictionary converters (#1661) (#1662)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 1201cb57a Remove parquet dictionary converters (#1661) (#1662)
1201cb57a is described below
commit 1201cb57ade6dcea12e57e990b20b0fab0274772
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Fri May 6 19:45:00 2022 +0100
Remove parquet dictionary converters (#1661) (#1662)
---
parquet/src/arrow/converter.rs | 131 +++--------------------------------------
1 file changed, 9 insertions(+), 122 deletions(-)
diff --git a/parquet/src/arrow/converter.rs b/parquet/src/arrow/converter.rs
index 169ee7b67..51e1d8290 100644
--- a/parquet/src/arrow/converter.rs
+++ b/parquet/src/arrow/converter.rs
@@ -15,30 +15,20 @@
// specific language governing permissions and limitations
// under the License.
-use crate::data_type::{ByteArray, DataType, FixedLenByteArray, Int96};
-// TODO: clean up imports (best done when there are few moving parts)
+use crate::data_type::{ByteArray, FixedLenByteArray, Int96};
use arrow::array::{
- Array, ArrayRef, BinaryBuilder, FixedSizeBinaryBuilder,
- IntervalDayTimeArray, IntervalDayTimeBuilder, IntervalYearMonthArray,
- IntervalYearMonthBuilder, LargeBinaryBuilder, LargeStringBuilder, PrimitiveBuilder,
- PrimitiveDictionaryBuilder, StringBuilder, StringDictionaryBuilder,
+ Array, ArrayRef, BinaryArray, BinaryBuilder, DecimalArray, FixedSizeBinaryArray,
+ FixedSizeBinaryBuilder, IntervalDayTimeArray, IntervalDayTimeBuilder,
+ IntervalYearMonthArray, IntervalYearMonthBuilder, LargeBinaryArray,
+ LargeBinaryBuilder, LargeStringArray, LargeStringBuilder, StringArray, StringBuilder,
+ TimestampNanosecondArray,
};
-use arrow::compute::cast;
use std::convert::{From, TryInto};
use std::sync::Arc;
use crate::errors::Result;
-use arrow::datatypes::{ArrowDictionaryKeyType, ArrowPrimitiveType};
-
-use arrow::array::{
- BinaryArray, DecimalArray, DictionaryArray, FixedSizeBinaryArray, LargeBinaryArray,
- LargeStringArray, PrimitiveArray, StringArray, TimestampNanosecondArray,
-};
use std::marker::PhantomData;
-use crate::data_type::Int32Type as ParquetInt32Type;
-use arrow::datatypes::Int32Type;
-
/// A converter is used to consume record reader's content and convert it to arrow
/// primitive array.
pub trait Converter<S, T> {
@@ -100,13 +90,11 @@ impl DecimalArrayConverter {
impl Converter<Vec<Option<FixedLenByteArray>>, DecimalArray> for DecimalArrayConverter {
fn convert(&self, source: Vec<Option<FixedLenByteArray>>) -> Result<DecimalArray> {
- let array = source.into_iter()
+ let array = source
+ .into_iter()
.map(|array| array.map(|array| Self::from_bytes_to_i128(array.data())))
.collect::<DecimalArray>()
- .with_precision_and_scale(
- self.precision as usize,
- self.scale as usize
- )?;
+ .with_precision_and_scale(self.precision as usize, self.scale as usize)?;
Ok(array)
}
@@ -251,92 +239,6 @@ impl Converter<Vec<Option<ByteArray>>, LargeBinaryArray> for LargeBinaryArrayCon
}
}
-pub struct StringDictionaryArrayConverter {}
-
-impl<K: ArrowDictionaryKeyType> Converter<Vec<Option<ByteArray>>, DictionaryArray<K>>
- for StringDictionaryArrayConverter
-{
- fn convert(&self, source: Vec<Option<ByteArray>>) -> Result<DictionaryArray<K>> {
- let data_size = source
- .iter()
- .map(|x| x.as_ref().map(|b| b.len()).unwrap_or(0))
- .sum();
-
- let keys_builder = PrimitiveBuilder::<K>::new(source.len());
- let values_builder = StringBuilder::with_capacity(source.len(), data_size);
-
- let mut builder = StringDictionaryBuilder::new(keys_builder, values_builder);
- for v in source {
- match v {
- Some(array) => {
- let _ = builder.append(array.as_utf8()?)?;
- }
- None => builder.append_null()?,
- }
- }
-
- Ok(builder.finish())
- }
-}
-
-pub struct DictionaryArrayConverter<DictValueSourceType, DictValueTargetType, ParquetType>
-{
- _dict_value_source_marker: PhantomData<DictValueSourceType>,
- _dict_value_target_marker: PhantomData<DictValueTargetType>,
- _parquet_marker: PhantomData<ParquetType>,
-}
-
-impl<DictValueSourceType, DictValueTargetType, ParquetType>
- DictionaryArrayConverter<DictValueSourceType, DictValueTargetType, ParquetType>
-{
- pub fn new() -> Self {
- Self {
- _dict_value_source_marker: PhantomData,
- _dict_value_target_marker: PhantomData,
- _parquet_marker: PhantomData,
- }
- }
-}
-
-impl<K, DictValueSourceType, DictValueTargetType, ParquetType>
- Converter<Vec<Option<<ParquetType as DataType>::T>>, DictionaryArray<K>>
- for DictionaryArrayConverter<DictValueSourceType, DictValueTargetType, ParquetType>
-where
- K: ArrowPrimitiveType,
- DictValueSourceType: ArrowPrimitiveType,
- DictValueTargetType: ArrowPrimitiveType,
- ParquetType: DataType,
- PrimitiveArray<DictValueSourceType>: From<Vec<Option<<ParquetType as DataType>::T>>>,
-{
- fn convert(
- &self,
- source: Vec<Option<<ParquetType as DataType>::T>>,
- ) -> Result<DictionaryArray<K>> {
- let keys_builder = PrimitiveBuilder::<K>::new(source.len());
- let values_builder = PrimitiveBuilder::<DictValueTargetType>::new(source.len());
-
- let mut builder = PrimitiveDictionaryBuilder::new(keys_builder, values_builder);
-
- let source_array: Arc<dyn Array> =
- Arc::new(PrimitiveArray::<DictValueSourceType>::from(source));
- let target_array = cast(&source_array, &DictValueTargetType::DATA_TYPE)?;
- let target = target_array
- .as_any()
- .downcast_ref::<PrimitiveArray<DictValueTargetType>>()
- .unwrap();
-
- for i in 0..target.len() {
- if target.is_null(i) {
- builder.append_null()?;
- } else {
- let _ = builder.append(target.value(i))?;
- }
- }
-
- Ok(builder.finish())
- }
-}
-
pub type Utf8Converter =
ArrayRefConverter<Vec<Option<ByteArray>>, StringArray, Utf8ArrayConverter>;
pub type LargeUtf8Converter =
@@ -348,21 +250,6 @@ pub type LargeBinaryConverter = ArrayRefConverter<
LargeBinaryArray,
LargeBinaryArrayConverter,
>;
-pub type StringDictionaryConverter<T> = ArrayRefConverter<
- Vec<Option<ByteArray>>,
- DictionaryArray<T>,
- StringDictionaryArrayConverter,
->;
-pub type DictionaryConverter<K, SV, TV, P> = ArrayRefConverter<
- Vec<Option<<P as DataType>::T>>,
- DictionaryArray<K>,
- DictionaryArrayConverter<SV, TV, P>,
->;
-pub type PrimitiveDictionaryConverter<K, V> = ArrayRefConverter<
- Vec<Option<<ParquetInt32Type as DataType>::T>>,
- DictionaryArray<K>,
- DictionaryArrayConverter<Int32Type, V, ParquetInt32Type>,
->;
pub type Int96Converter =
ArrayRefConverter<Vec<Option<Int96>>, TimestampNanosecondArray, Int96ArrayConverter>;