You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2020/11/18 16:23:51 UTC

[GitHub] [arrow] alamb commented on a change in pull request #8698: ARROW-10636: [Rust][Parquet] Remove rust specialization

alamb commented on a change in pull request #8698:
URL: https://github.com/apache/arrow/pull/8698#discussion_r526207035



##########
File path: rust/parquet/src/column/writer.rs
##########
@@ -955,81 +963,31 @@ impl<T: DataType> ColumnWriterImpl<T> {
 /// Trait to define default encoding for types, including whether or not the type
 /// supports dictionary encoding.
 trait EncodingWriteSupport {
-    /// Returns encoding for a column when no other encoding is provided in writer
-    /// properties.
-    fn fallback_encoding(props: &WriterProperties) -> Encoding;
-
     /// Returns true if dictionary is supported for column writer, false otherwise.
     fn has_dictionary_support(props: &WriterProperties) -> bool;
 }
 
-// Basic implementation, always falls back to PLAIN and supports dictionary.
-impl<T: DataType> EncodingWriteSupport for ColumnWriterImpl<T> {
-    default fn fallback_encoding(_props: &WriterProperties) -> Encoding {
-        Encoding::PLAIN
-    }
-
-    default fn has_dictionary_support(_props: &WriterProperties) -> bool {
-        true
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<BoolType> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::RLE,
-        }
-    }
-
-    // Boolean column does not support dictionary encoding and should fall back to
-    // whatever fallback encoding is defined.
-    fn has_dictionary_support(_props: &WriterProperties) -> bool {
-        false
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<Int32Type> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED,
-        }
+/// Returns encoding for a column when no other encoding is provided in writer properties.
+fn fallback_encoding(kind: Type, props: &WriterProperties) -> Encoding {
+    match (kind, props.writer_version()) {
+        (Type::BOOLEAN, WriterVersion::PARQUET_2_0) => Encoding::RLE,
+        (Type::INT32, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BINARY_PACKED,
+        (Type::INT64, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BINARY_PACKED,
+        (Type::BYTE_ARRAY, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BYTE_ARRAY,
+        (Type::FIXED_LEN_BYTE_ARRAY, WriterVersion::PARQUET_2_0) => Encoding::DELTA_BYTE_ARRAY,
+        _ => Encoding::PLAIN,
     }
 }
 
-impl EncodingWriteSupport for ColumnWriterImpl<Int64Type> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BINARY_PACKED,
-        }
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<ByteArrayType> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY,
-        }
-    }
-}
-
-impl EncodingWriteSupport for ColumnWriterImpl<FixedLenByteArrayType> {
-    fn fallback_encoding(props: &WriterProperties) -> Encoding {
-        match props.writer_version() {
-            WriterVersion::PARQUET_1_0 => Encoding::PLAIN,
-            WriterVersion::PARQUET_2_0 => Encoding::DELTA_BYTE_ARRAY,
-        }
-    }
-
-    fn has_dictionary_support(props: &WriterProperties) -> bool {
-        match props.writer_version() {
-            // Dictionary encoding was not enabled in PARQUET 1.0
-            WriterVersion::PARQUET_1_0 => false,
-            WriterVersion::PARQUET_2_0 => true,
-        }
+/// Returns true if dictionary is supported for column writer, false otherwise.

Review comment:
       I carefully checked the logic in this file and I believe it matches the original without the need for specialization. 👍 

##########
File path: rust/parquet/src/data_type.rs
##########
@@ -424,16 +460,165 @@ impl AsBytes for str {
     }
 }
 
-/// Contains the Parquet physical type information as well as the Rust primitive type
-/// presentation.
-pub trait DataType: 'static {
-    type T: std::cmp::PartialEq
+pub(crate) mod private {
+    use super::{AsBytes, Result, ParquetError};
+
+    pub type BitIndex = u64;
+
+    #[derive(Copy, Clone)]
+    pub enum EncodedValue<'a> {
+        /// The value can be encoded from the following bytes
+        Bytes {
+            data: &'a [u8]
+        },
+        /// The value encodes as a specific set bit at a given index
+        Bits{
+            index: BitIndex
+        },
+    }
+
+    /// Sealed trait to start to remove specialisation from implementations
+    ///
+    /// This is done to force the associated value type to be unimplementable outside of this
+    /// crate, and thus hint to the type system (and end user) traits are public for the contract
+    /// and not for extension.
+    pub trait ParquetValueType : std::cmp::PartialEq
         + std::fmt::Debug
+        + std::fmt::Display
         + std::default::Default
         + std::clone::Clone
-        + AsBytes
-        + FromBytes
-        + PartialOrd;
+        + super::AsBytes
+        + super::FromBytes
+        + super::SliceAsBytes
+        + PartialOrd
+    {
+        /// Return the most primitive version of encoding a given type
+        fn encoded(&self) -> EncodedValue<'_>;
+
+        /// Return the encoded size for a type
+        fn dict_encoding_size(&self) -> (usize, usize) {
+            (std::mem::size_of::<Self>(), 1)
+        }
+
+        /// Return the value as i64 if possible
+        ///
+        /// This is essentially the same as `std::convert::TryInto<i64>` but can
+        /// implemented for `f32` and `f64`, types that would fail orphan rules
+        fn as_i64(&self) -> Result<i64> {
+            Err(general_err!("Type cannot be converted to i64"))
+        }
+
+        /// Return the value as u64 if possible
+        ///
+        /// This is essentially the same as `std::convert::TryInto<u64>` but can
+        /// implemented for `f32` and `f64`, types that would fail orphan rules
+        fn as_u64(&self) -> Result<u64> {
+            self.as_i64()
+                .map_err(|_| general_err!("Type cannot be converted to u64"))
+                .map(|x| x as u64)
+        }
+
+        /// Return the value as an Any to allow for downcasts without transmutation
+        fn as_any(&self) -> &dyn std::any::Any;
+
+        /// Return the value as an mutable Any to allow for downcasts without transmutation
+        fn as_mut_any(&mut self) -> &mut dyn std::any::Any;
+    }
+
+    impl ParquetValueType for bool {
+        fn encoded(&self) -> EncodedValue<'_> {
+            EncodedValue::Bits { index: *self as u64 }
+        }
+
+        fn as_i64(&self) -> Result<i64> {
+            Ok(*self as i64)
+        }
+
+        fn as_any(&self) -> &dyn std::any::Any {
+            self
+        }
+
+        fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
+            self
+        }
+    }
+
+    /// Hopelessly unsafe function that emulates `num::as_ne_bytes`
+    ///
+    /// It is not recommended to use this outside of this private module as, while it
+    /// _should_ work for primitive values, it is little better than a transmutation
+    /// and can act as a backdoor into mis-interpreting types as arbitary byte slices
+    fn as_raw<'a, T>(value: *const T) -> &'a [u8] {
+        unsafe {
+            let value = value as *const u8;
+            std::slice::from_raw_parts(value, std::mem::size_of::<T>())
+        }
+    }
+
+    macro_rules! impl_from_raw {
+        ($ty: ty, $self: ident => $as_i64: block) => {
+            impl ParquetValueType for $ty {
+                fn encoded(&self) -> EncodedValue<'_> {
+                    EncodedValue::Bytes { data: as_raw(&*self) }
+                }
+
+                fn as_i64(&$self) -> Result<i64> {
+                    $as_i64
+                }
+
+                fn as_any(&self) -> &dyn std::any::Any {
+                    self
+                }
+
+                fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
+                    self
+                }
+            }
+        }
+    }
+
+    impl_from_raw!(i32, self => { Ok(*self as i64) });
+    impl_from_raw!(i64, self => { Ok(*self) });
+    impl_from_raw!(f32, self => { Err(general_err!("Type cannot be converted to i64")) });
+    impl_from_raw!(f64, self => { Err(general_err!("Type cannot be converted to i64")) });

Review comment:
       ```suggestion
       impl_from_raw!(f32, self => { Err(general_err!("Type f32 cannot be converted to i64")) });
       impl_from_raw!(f64, self => { Err(general_err!("Type f64 cannot be converted to i64")) });
   ```

##########
File path: rust/rust-toolchain
##########
@@ -1 +0,0 @@
-nightly-2020-11-14

Review comment:
       🎉 

##########
File path: rust/parquet/src/data_type.rs
##########
@@ -424,16 +460,165 @@ impl AsBytes for str {
     }
 }
 
-/// Contains the Parquet physical type information as well as the Rust primitive type
-/// presentation.
-pub trait DataType: 'static {
-    type T: std::cmp::PartialEq
+pub(crate) mod private {
+    use super::{AsBytes, Result, ParquetError};
+
+    pub type BitIndex = u64;
+
+    #[derive(Copy, Clone)]
+    pub enum EncodedValue<'a> {
+        /// The value can be encoded from the following bytes
+        Bytes {
+            data: &'a [u8]
+        },
+        /// The value encodes as a specific set bit at a given index

Review comment:
       ```suggestion
           /// The value encoded as a specific set bit at a given index
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org