You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/04/01 09:45:45 UTC
[GitHub] [arrow-rs] jhorstmann commented on a change in pull request #1499: Alternative implementation of nullif kernel by slicing nested buffers

jhorstmann commented on a change in pull request #1499:
URL: https://github.com/apache/arrow-rs/pull/1499#discussion_r840416500



##########
File path: arrow/src/compute/kernels/boolean.rs
##########
@@ -575,10 +577,241 @@ where
     Ok(PrimitiveArray::<T>::from(data))
 }
 
+/// Creates a (mostly) zero-copy slice of the given buffers so that they can be combined
+/// in the same array with other buffers that start at offset 0.
+/// The only buffers that need an actual copy are booleans and validity (if they are not byte-aligned).
+/// This is useful when a kernel calculates a new validity bitmap but wants to reuse other buffers.
+fn slice_buffers(
+    buffers: &[Buffer],
+    offset: usize,
+    len: usize,
+    data_type: &DataType,
+    child_data: &[ArrayData],
+) -> (Vec<Buffer>, Vec<ArrayData>) {
+    use std::mem::size_of;
+
+    if offset == 0 {
+        return (buffers.to_vec(), child_data.to_vec());
+    }
+
+    // we only need to do something special to child data in 2 of the match branches
+    let mut result_child_data = None;
+
+    let result_buffers = match data_type {
+        DataType::Boolean => vec![buffers[0].bit_slice(offset, len)],
+        DataType::Int8 | DataType::UInt8 => {
+            vec![buffers[0].slice(offset * size_of::<u8>())]
+        }
+        DataType::Int16 | DataType::UInt16 | DataType::Float16 => {
+            vec![buffers[0].slice(offset * size_of::<u16>())]
+        }
+        DataType::Int32 | DataType::UInt32 | DataType::Float32 => {
+            vec![buffers[0].slice(offset * size_of::<u32>())]
+        }
+        DataType::Int64 | DataType::UInt64 | DataType::Float64 => {
+            vec![buffers[0].slice(offset * size_of::<u64>())]
+        }
+        DataType::Timestamp(_, _) | DataType::Duration(_) => {
+            vec![buffers[0].slice(offset * size_of::<u64>())]
+        }
+        DataType::Date32 | DataType::Time32(_) => {
+            vec![buffers[0].slice(offset * size_of::<u32>())]
+        }
+        DataType::Date64 | DataType::Time64(_) => {
+            vec![buffers[0].slice(offset * size_of::<u64>())]
+        }
+        DataType::Interval(IntervalUnit::YearMonth) => {
+            vec![buffers[0].slice(offset * size_of::<u32>())]
+        }
+        DataType::Interval(IntervalUnit::DayTime) => {
+            vec![buffers[0].slice(offset * 2 * size_of::<u32>())]
+        }
+        DataType::Interval(IntervalUnit::MonthDayNano) => {
+            vec![buffers[0].slice(offset * (2 * size_of::<u32>() + size_of::<u64>()))]
+        }
+        DataType::Decimal(_, _) => vec![buffers[0].slice(offset * size_of::<i128>())],
+        DataType::Dictionary(key_type, _) => match key_type.as_ref() {
+            DataType::Int8 | DataType::UInt8 => {
+                vec![buffers[0].slice(offset * size_of::<u8>())]
+            }
+            DataType::Int16 | DataType::UInt16 => {
+                vec![buffers[0].slice(offset * size_of::<u16>())]
+            }
+            DataType::Int32 | DataType::UInt32 => {
+                vec![buffers[0].slice(offset * size_of::<u32>())]
+            }
+            DataType::Int64 | DataType::UInt64 => {
+                vec![buffers[0].slice(offset * size_of::<u64>())]
+            }
+            _ => unreachable!(),
+        },
+        DataType::List(_) => {
+            // safe because for List the first buffer is guaranteed to contain i32 offsets
+            vec![buffers[0].slice(offset * size_of::<i32>())]
+        }
+        DataType::LargeList(_) => {
+            // safe because for LargeList the first buffer is guaranteed to contain i64 offsets
+            vec![buffers[0].slice(offset * size_of::<i64>())]
+        }
+        DataType::Binary | DataType::Utf8 => {
+            // safe because for Binary/Utf8 the first buffer is guaranteed to contain i32 offsets
+            vec![
+                buffers[0].slice(offset * size_of::<i32>()),
+                buffers[1].clone(),
+            ]
+        }
+        DataType::LargeBinary | DataType::LargeUtf8 => {
+            // safe because for LargeBinary/LargeUtf8 the first buffer is guaranteed to contain i64 offsets
+            vec![
+                buffers[0].slice(offset * size_of::<i64>()),
+                buffers[1].clone(),
+            ]
+        }
+        DataType::FixedSizeBinary(size) => {
+            vec![buffers[0].slice(offset * (*size as usize))]
+        }
+        DataType::FixedSizeList(_field, size) => {
+            result_child_data = Some(
+                child_data
+                    .iter()
+                    .map(|d| {
+                        slice_array_data(
+                            d,
+                            offset * (*size as usize),
+                            len * (*size as usize),
+                        )
+                    })
+                    .collect(),
+            );
+            vec![]
+        }
+        DataType::Struct(_) => {
+            result_child_data = Some(
+                child_data
+                    .iter()
+                    .map(|d| slice_array_data(d, offset, len))
+                    .collect(),
+            );
+            vec![]
+        }
+        DataType::Union(..) => {
+            // TODO: verify this is actually correct
+            if buffers.len() > 1 {
+                // dense union, type ids and offsets
+                vec![
+                    buffers[0].slice(offset * size_of::<i8>()),
+                    buffers[1].slice(offset * size_of::<u32>()),
+                ]
+            } else {
+                // sparse union with only type ids
+                // should this also slice the child arrays?
+                vec![buffers[0].slice(offset * size_of::<i8>())]
+            }
+        }
+        DataType::Map(_, _) => {
+            // TODO: verify this is actually correct

Review comment:
       This is probably not correct since Map has offsets like a list, only the offsets buffer should need to be sliced.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org