You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/19 12:41:51 UTC

[GitHub] [arrow-datafusion] alamb commented on a diff in pull request #2261: Introduce RowLayout to represent rows for different purposes

alamb commented on code in PR #2261:
URL: https://github.com/apache/arrow-datafusion/pull/2261#discussion_r853006965


##########
datafusion/core/src/row/layout.rs:
##########
@@ -17,26 +17,94 @@
 
 //! Various row layout for different use case
 
-use crate::row::{schema_null_free, var_length};
+use crate::row::{row_supported, schema_null_free, var_length};
 use arrow::datatypes::{DataType, Schema};
 use arrow::util::bit_util::{ceil, round_upto_power_of_2};
+use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
 const UTF8_DEFAULT_SIZE: usize = 20;
 const BINARY_DEFAULT_SIZE: usize = 100;
 
+#[derive(Copy, Clone, Debug)]
+/// Type of a RowLayout
+pub enum RowType {

Review Comment:
   👍 



##########
datafusion/core/src/row/layout.rs:
##########
@@ -17,26 +17,94 @@
 
 //! Various row layout for different use case
 
-use crate::row::{schema_null_free, var_length};
+use crate::row::{row_supported, schema_null_free, var_length};
 use arrow::datatypes::{DataType, Schema};
 use arrow::util::bit_util::{ceil, round_upto_power_of_2};
+use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
 const UTF8_DEFAULT_SIZE: usize = 20;
 const BINARY_DEFAULT_SIZE: usize = 100;
 
+#[derive(Copy, Clone, Debug)]
+/// Type of a RowLayout
+pub enum RowType {
+    /// This type of layout will store each field with minimum bytes for space efficiency.
+    /// Its typical use case represents a sorting payload that accesses all row fields as a unit.
+    Compact,
+    /// This type of layout will store one 8-byte word per field for CPU-friendly,
+    /// It is mainly used to represent the rows with frequently updated content,
+    /// for example, grouping state for hash aggregation.
+    WordAligned,
+    // RawComparable,
+}
+
+/// Reveals how the fields of a record are stored in the raw-bytes format
+pub(crate) struct RowLayout {
+    /// Type of the layout
+    type_: RowType,
+    /// If a row is null free according to its schema
+    pub(crate) null_free: bool,
+    /// The number of bytes used to store null bits for each field.
+    pub(crate) null_width: usize,
+    /// Length in bytes for `values` part of the current tuple.
+    pub(crate) values_width: usize,
+    /// Total number of fields for each tuple.
+    pub(crate) field_count: usize,
+    /// Starting offset for each fields in the raw bytes.
+    pub(crate) field_offsets: Vec<usize>,
+}
+
+impl RowLayout {
+    pub(crate) fn new(schema: &Arc<Schema>, type_: RowType) -> Self {

Review Comment:
   ```suggestion
       pub(crate) fn new(schema: &Schema, type_: RowType) -> Self {
   ```



##########
datafusion/core/src/row/layout.rs:
##########
@@ -17,26 +17,94 @@
 
 //! Various row layout for different use case
 
-use crate::row::{schema_null_free, var_length};
+use crate::row::{row_supported, schema_null_free, var_length};
 use arrow::datatypes::{DataType, Schema};
 use arrow::util::bit_util::{ceil, round_upto_power_of_2};
+use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
 const UTF8_DEFAULT_SIZE: usize = 20;
 const BINARY_DEFAULT_SIZE: usize = 100;
 
+#[derive(Copy, Clone, Debug)]
+/// Type of a RowLayout
+pub enum RowType {
+    /// This type of layout will store each field with minimum bytes for space efficiency.
+    /// Its typical use case represents a sorting payload that accesses all row fields as a unit.
+    Compact,
+    /// This type of layout will store one 8-byte word per field for CPU-friendly,
+    /// It is mainly used to represent the rows with frequently updated content,
+    /// for example, grouping state for hash aggregation.
+    WordAligned,
+    // RawComparable,
+}
+
+/// Reveals how the fields of a record are stored in the raw-bytes format
+pub(crate) struct RowLayout {
+    /// Type of the layout
+    type_: RowType,
+    /// If a row is null free according to its schema
+    pub(crate) null_free: bool,
+    /// The number of bytes used to store null bits for each field.
+    pub(crate) null_width: usize,
+    /// Length in bytes for `values` part of the current tuple.
+    pub(crate) values_width: usize,
+    /// Total number of fields for each tuple.
+    pub(crate) field_count: usize,
+    /// Starting offset for each fields in the raw bytes.
+    pub(crate) field_offsets: Vec<usize>,
+}
+
+impl RowLayout {
+    pub(crate) fn new(schema: &Arc<Schema>, type_: RowType) -> Self {
+        assert!(row_supported(schema));
+        let null_free = schema_null_free(schema);
+        let field_count = schema.fields().len();
+        let null_width = if null_free { 0 } else { ceil(field_count, 8) };
+        let (field_offsets, values_width) = match type_ {
+            RowType::Compact => compact_offsets(null_width, schema),
+            RowType::WordAligned => word_aligned_offsets(null_width, schema),
+        };
+        Self {
+            type_,
+            null_free,
+            null_width,
+            values_width,
+            field_count,
+            field_offsets,
+        }
+    }
+
+    #[inline(always)]
+    pub(crate) fn init_varlena_offset(&self) -> usize {
+        self.null_width + self.values_width
+    }
+}
+
+impl Debug for RowLayout {
+    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("RowLayout")
+            .field("type", &self.type_)
+            .field("null_width", &self.null_width)
+            .field("values_width", &self.values_width)
+            .field("field_count", &self.field_count)
+            .field("offsets", &self.field_offsets)
+            .finish()
+    }
+}
+
 /// Get relative offsets for each field and total width for values
-pub fn get_offsets(null_width: usize, schema: &Arc<Schema>) -> (Vec<usize>, usize) {
+fn compact_offsets(null_width: usize, schema: &Arc<Schema>) -> (Vec<usize>, usize) {

Review Comment:
   ```suggestion
   fn compact_offsets(null_width: usize, schema: &Schema) -> (Vec<usize>, usize) {
   ```



##########
datafusion/core/src/row/layout.rs:
##########
@@ -17,26 +17,94 @@
 
 //! Various row layout for different use case
 
-use crate::row::{schema_null_free, var_length};
+use crate::row::{row_supported, schema_null_free, var_length};
 use arrow::datatypes::{DataType, Schema};
 use arrow::util::bit_util::{ceil, round_upto_power_of_2};
+use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
 const UTF8_DEFAULT_SIZE: usize = 20;
 const BINARY_DEFAULT_SIZE: usize = 100;
 
+#[derive(Copy, Clone, Debug)]
+/// Type of a RowLayout
+pub enum RowType {
+    /// This type of layout will store each field with minimum bytes for space efficiency.
+    /// Its typical use case represents a sorting payload that accesses all row fields as a unit.
+    Compact,
+    /// This type of layout will store one 8-byte word per field for CPU-friendly,
+    /// It is mainly used to represent the rows with frequently updated content,
+    /// for example, grouping state for hash aggregation.
+    WordAligned,
+    // RawComparable,
+}
+
+/// Reveals how the fields of a record are stored in the raw-bytes format
+pub(crate) struct RowLayout {
+    /// Type of the layout
+    type_: RowType,
+    /// If a row is null free according to its schema
+    pub(crate) null_free: bool,
+    /// The number of bytes used to store null bits for each field.
+    pub(crate) null_width: usize,
+    /// Length in bytes for `values` part of the current tuple.
+    pub(crate) values_width: usize,
+    /// Total number of fields for each tuple.
+    pub(crate) field_count: usize,
+    /// Starting offset for each fields in the raw bytes.
+    pub(crate) field_offsets: Vec<usize>,
+}
+
+impl RowLayout {
+    pub(crate) fn new(schema: &Arc<Schema>, type_: RowType) -> Self {
+        assert!(row_supported(schema));
+        let null_free = schema_null_free(schema);
+        let field_count = schema.fields().len();
+        let null_width = if null_free { 0 } else { ceil(field_count, 8) };
+        let (field_offsets, values_width) = match type_ {
+            RowType::Compact => compact_offsets(null_width, schema),
+            RowType::WordAligned => word_aligned_offsets(null_width, schema),
+        };
+        Self {
+            type_,
+            null_free,
+            null_width,
+            values_width,
+            field_count,
+            field_offsets,
+        }
+    }
+
+    #[inline(always)]
+    pub(crate) fn init_varlena_offset(&self) -> usize {
+        self.null_width + self.values_width
+    }
+}
+
+impl Debug for RowLayout {

Review Comment:
   I wonder if there is a reason you chose not to use `#[derive(Debug)]` here rather than a manual implementation
   



##########
datafusion/core/src/row/reader.rs:
##########
@@ -85,52 +85,36 @@ macro_rules! fn_get_idx_opt {
 
 /// Read the tuple `data[base_offset..]` we are currently pointing to
 pub struct RowReader<'a> {
+    /// Layout on how to read each field

Review Comment:
   👍  this is nice



##########
datafusion/core/src/row/layout.rs:
##########
@@ -50,13 +118,23 @@ fn type_width(dt: &DataType) -> usize {
     }
 }
 
+fn word_aligned_offsets(null_width: usize, schema: &Arc<Schema>) -> (Vec<usize>, usize) {
+    let mut offsets = vec![];
+    let mut offset = null_width;
+    for _ in schema.fields() {
+        offsets.push(offset);
+        offset += 8; // a 8-bytes word for each field

Review Comment:
   Are we sure that all field types can fit within 8 bytes?  I am thinking about types like `DataType::Decimal` stored in a `u128` / `16 bytes`)?
   
   Also it would be nice to know variable length data works here



##########
datafusion/core/src/row/layout.rs:
##########
@@ -17,26 +17,94 @@
 
 //! Various row layout for different use case
 
-use crate::row::{schema_null_free, var_length};
+use crate::row::{row_supported, schema_null_free, var_length};
 use arrow::datatypes::{DataType, Schema};
 use arrow::util::bit_util::{ceil, round_upto_power_of_2};
+use std::fmt::{Debug, Formatter};
 use std::sync::Arc;
 
 const UTF8_DEFAULT_SIZE: usize = 20;
 const BINARY_DEFAULT_SIZE: usize = 100;
 
+#[derive(Copy, Clone, Debug)]
+/// Type of a RowLayout
+pub enum RowType {
+    /// This type of layout will store each field with minimum bytes for space efficiency.
+    /// Its typical use case represents a sorting payload that accesses all row fields as a unit.
+    Compact,
+    /// This type of layout will store one 8-byte word per field for CPU-friendly,
+    /// It is mainly used to represent the rows with frequently updated content,
+    /// for example, grouping state for hash aggregation.
+    WordAligned,
+    // RawComparable,
+}
+
+/// Reveals how the fields of a record are stored in the raw-bytes format
+pub(crate) struct RowLayout {
+    /// Type of the layout
+    type_: RowType,
+    /// If a row is null free according to its schema
+    pub(crate) null_free: bool,
+    /// The number of bytes used to store null bits for each field.
+    pub(crate) null_width: usize,
+    /// Length in bytes for `values` part of the current tuple.
+    pub(crate) values_width: usize,
+    /// Total number of fields for each tuple.
+    pub(crate) field_count: usize,
+    /// Starting offset for each fields in the raw bytes.
+    pub(crate) field_offsets: Vec<usize>,
+}
+
+impl RowLayout {
+    pub(crate) fn new(schema: &Arc<Schema>, type_: RowType) -> Self {

Review Comment:
   Minor stylistic comment



##########
datafusion/core/src/row/writer.rs:
##########
@@ -96,51 +96,34 @@ macro_rules! fn_set_idx {
 
 /// Reusable row writer backed by Vec<u8>
 pub struct RowWriter {
+    /// Layout on how to write each field

Review Comment:
   ❤️  for reduced repetition



##########
datafusion/core/src/row/mod.rs:
##########
@@ -176,8 +176,8 @@ mod tests {
                     let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(a)])?;
                     let mut vector = vec![0; 1024];
                     let row_offsets =

Review Comment:
   Would it be possible to extend these tests to include the new `WordAligned` format as well?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org