You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "tustvold (via GitHub)" <gi...@apache.org> on 2023/02/09 13:23:32 UTC

[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3662: feat + fix: IPC support for run encoded array.

tustvold commented on code in PR #3662:
URL: https://github.com/apache/arrow-rs/pull/3662#discussion_r1101398368


##########
arrow-array/src/array/run_array.rs:
##########
@@ -144,15 +144,16 @@ impl<R: RunEndIndexType> RunArray<R> {
         })
     }
 
-    /// Returns index to the physical array for the given index to the logical array.
-    /// Performs a binary search on the run_ends array for the input index.
     #[inline]
-    pub fn get_physical_index(&self, logical_index: usize) -> Option<usize> {
-        if logical_index >= self.len() {
+    fn get_physical_index_from_run_ends_array(
+        run_ends: &PrimitiveArray<R>,

Review Comment:
   This argument only ever appears to be given `self.run_ends`? Is it necessary?



##########
arrow-array/src/array/run_array.rs:
##########
@@ -233,6 +275,63 @@ impl<R: RunEndIndexType> RunArray<R> {
         }
         Ok(physical_indices)
     }
+
+    /// Returns a `RunArray` with zero offset and length matching the last value
+    /// in run_ends array.
+    pub fn into_non_sliced_array(self) -> Result<Self, ArrowError> {

Review Comment:
   I think I would prefer this live alongside the IPC implementation, as it is effectively an optimisation for the writer



##########
arrow-array/src/array/run_array.rs:
##########
@@ -233,6 +275,63 @@ impl<R: RunEndIndexType> RunArray<R> {
         }
         Ok(physical_indices)
     }
+
+    /// Returns a `RunArray` with zero offset and length matching the last value
+    /// in run_ends array.
+    pub fn into_non_sliced_array(self) -> Result<Self, ArrowError> {
+        if self.data.offset() == 0 && self.data.len() == Self::logical_len(&self.run_ends)
+        {
+            return Ok(self);
+        }
+        // The physical index of original run_ends array from which the `ArrayData`is sliced.
+        let start_physical_index = Self::get_physical_index_from_run_ends_array(
+            &self.run_ends,
+            self.data.offset(),
+        )
+        .ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Cannot convert the offset {} to physical index",
+                self.data.offset()
+            ))
+        })?;
+
+        // The logical length of original run_ends array until which the `ArrayData` is sliced.
+        let end_logical_index = self.data.offset() + self.data.len() - 1;
+        // The physical index of original run_ends array until which the `ArrayData`is sliced.
+        let end_physical_index =
+            Self::get_physical_index_from_run_ends_array(&self.run_ends, end_logical_index).ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Cannot convert the `offset + len - 1` {end_logical_index} to physical index"
+                ))
+            })?;
+
+        let physical_length = end_physical_index - start_physical_index + 1;
+
+        // build new run_ends array by subtrating offset from run ends.
+        let new_run_ends: PrimitiveArray<R> = self
+            .run_ends
+            .values()
+            .iter()
+            .skip(start_physical_index)
+            .take(physical_length)
+            .map(|f| f.as_usize() - self.data.offset())
+            .map(|f| f.min(self.len()))
+            .map(R::Native::from_usize)
+            .collect();
+
+        // build new values by slicing physical indices.
+        let new_values = self
+            .values
+            .slice(start_physical_index, physical_length)
+            .into_data();
+
+        let builder = ArrayDataBuilder::new(self.data_type().clone())
+            .len(self.len())
+            .add_child_data(new_run_ends.into_data())
+            .add_child_data(new_values);
+        let array_data = builder.build()?;

Review Comment:
   This should probably use `build_unchecked` to avoid doing the validation again



##########
arrow-ipc/src/writer.rs:
##########
@@ -1242,24 +1289,45 @@ fn write_array_data(
         }
     }
 
-    if !matches!(array_data.data_type(), DataType::Dictionary(_, _)) {
-        // recursively write out nested structures
-        for data_ref in array_data.child_data() {
-            // write the nested data (e.g list data)
-            offset = write_array_data(
-                data_ref,
-                buffers,
-                arrow_data,
-                nodes,
-                offset,
-                data_ref.len(),
-                data_ref.null_count(),
-                compression_codec,
-                write_options,
-            )?;
+    match array_data.data_type() {
+        DataType::Dictionary(_, _) => {}
+        DataType::RunEndEncoded(_, _) => {
+            // unslice the run encoded array.
+            let arr = unslice_run_array(array_data.clone())?;

Review Comment:
   I can't see where we alter the offset that gets written for the `RunEndEncoded` array to be 0?



##########
arrow-array/src/array/run_array.rs:
##########
@@ -233,6 +275,63 @@ impl<R: RunEndIndexType> RunArray<R> {
         }
         Ok(physical_indices)
     }
+
+    /// Returns a `RunArray` with zero offset and length matching the last value
+    /// in run_ends array.
+    pub fn into_non_sliced_array(self) -> Result<Self, ArrowError> {
+        if self.data.offset() == 0 && self.data.len() == Self::logical_len(&self.run_ends)
+        {
+            return Ok(self);
+        }
+        // The physical index of original run_ends array from which the `ArrayData`is sliced.
+        let start_physical_index = Self::get_physical_index_from_run_ends_array(
+            &self.run_ends,
+            self.data.offset(),
+        )
+        .ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Cannot convert the offset {} to physical index",
+                self.data.offset()
+            ))
+        })?;
+
+        // The logical length of original run_ends array until which the `ArrayData` is sliced.
+        let end_logical_index = self.data.offset() + self.data.len() - 1;
+        // The physical index of original run_ends array until which the `ArrayData`is sliced.
+        let end_physical_index =
+            Self::get_physical_index_from_run_ends_array(&self.run_ends, end_logical_index).ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Cannot convert the `offset + len - 1` {end_logical_index} to physical index"
+                ))
+            })?;
+
+        let physical_length = end_physical_index - start_physical_index + 1;
+
+        // build new run_ends array by subtrating offset from run ends.
+        let new_run_ends: PrimitiveArray<R> = self
+            .run_ends
+            .values()
+            .iter()
+            .skip(start_physical_index)
+            .take(physical_length)
+            .map(|f| f.as_usize() - self.data.offset())
+            .map(|f| f.min(self.len()))
+            .map(R::Native::from_usize)

Review Comment:
   I think this will convert overflow to `None` which is not correct



##########
arrow-data/src/equal/run.rs:
##########
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::ArrayData;
+
+use super::equal_range;
+
+/// The current implementation of comparison of run array support partial comparison.
+/// Comparing run encoded array based on logical indices (`lhs_start`, `rhs_start`) will
+/// be time consuming as converting from logical index to physical index cannot be done
+/// in constat time. The current comparison compares the underlying physical arrays.

Review Comment:
   ```suggestion
   /// in constant time. The current comparison compares the underlying physical arrays.
   ```



##########
arrow-data/src/equal/run.rs:
##########
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::ArrayData;
+
+use super::equal_range;
+
+/// The current implementation of comparison of run array support partial comparison.
+/// Comparing run encoded array based on logical indices (`lhs_start`, `rhs_start`) will
+/// be time consuming as converting from logical index to physical index cannot be done
+/// in constat time. The current comparison compares the underlying physical arrays.
+pub(super) fn run_equal(
+    lhs: &ArrayData,
+    rhs: &ArrayData,
+    lhs_start: usize,
+    rhs_start: usize,
+    len: usize,
+) -> bool {
+    if lhs_start != 0
+        || rhs_start != 0
+        || (lhs.len() != len && rhs.len() != len)
+        || lhs.offset() > 0
+        || rhs.offset() > 0
+    {
+        unimplemented!("Partial comparison for run array not supported.")

Review Comment:
   ```suggestion
           unimplemented!("Logical comparison for run array not supported.")
   ```



##########
arrow-data/src/data.rs:
##########
@@ -1514,9 +1514,10 @@ impl ArrayData {
             Ok(())
         })?;
 
-        if prev_value.as_usize() != array_len {
+        if prev_value.as_usize() < (self.offset + self.len) {

Review Comment:
   :+1:



##########
arrow-array/src/array/run_array.rs:
##########
@@ -204,12 +222,36 @@ impl<R: RunEndIndexType> RunArray<R> {
                 .unwrap()
         });
 
+        // Return early if all the logical indices cannot be converted to physical indices.
+        let largest_logical_index =

Review Comment:
   Does this remove the need for the later check https://github.com/apache/arrow-rs/pull/3662/files#diff-5a6a759d325c1c9512125fcb8fc9dad89242f61e9fca5cffaa9297387cf94d07R269



##########
arrow-ipc/regen.sh:
##########
@@ -18,15 +18,13 @@
 
 DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 
-# Change to the toplevel Rust directory
-pushd $DIR/../../
+# Change to the toplevel `arrow-rs` directory

Review Comment:
   Thank you for updating this script, I've just been doing it manually :sweat_smile: 



##########
arrow-array/src/array/run_array.rs:
##########
@@ -552,6 +651,33 @@ mod tests {
         result
     }
 
+    fn compare_logical_and_physical_indices(

Review Comment:
   ```suggestion
       /// Asserts that `logical_array[logical_indices[*]] == physical_array[physical_indices[*]]`
       fn compare_logical_and_physical_indices(
   ```



##########
arrow-ipc/src/writer.rs:
##########
@@ -218,6 +218,30 @@ impl IpcDataGenerator {
                     )?;
                 }
             }
+            DataType::RunEndEncoded(run_ends, values) => {
+                if column.data().child_data().len() != 2 {
+                    return Err(ArrowError::InvalidArgumentError(format!(
+                        "The run encoded array should have exactly two child arrays. Found {}",
+                        column.data().child_data().len()
+                    )));
+                }
+                let run_ends_array = make_array(column.data().child_data()[0].clone());
+                let values_array = make_array(column.data().child_data()[1].clone());
+                self.encode_dictionaries(

Review Comment:
   I don't think the run ends can be dictionary encoded, so I think we could skip this for the first child



##########
arrow-array/src/array/run_array.rs:
##########
@@ -233,6 +275,63 @@ impl<R: RunEndIndexType> RunArray<R> {
         }
         Ok(physical_indices)
     }
+
+    /// Returns a `RunArray` with zero offset and length matching the last value
+    /// in run_ends array.
+    pub fn into_non_sliced_array(self) -> Result<Self, ArrowError> {
+        if self.data.offset() == 0 && self.data.len() == Self::logical_len(&self.run_ends)
+        {
+            return Ok(self);
+        }
+        // The physical index of original run_ends array from which the `ArrayData`is sliced.
+        let start_physical_index = Self::get_physical_index_from_run_ends_array(
+            &self.run_ends,
+            self.data.offset(),
+        )
+        .ok_or_else(|| {

Review Comment:
   FWIW these could panic, as the ArrayData should be valid



##########
arrow-data/src/equal/run.rs:
##########
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::ArrayData;
+
+use super::equal_range;
+
+/// The current implementation of comparison of run array support partial comparison.
+/// Comparing run encoded array based on logical indices (`lhs_start`, `rhs_start`) will
+/// be time consuming as converting from logical index to physical index cannot be done

Review Comment:
   Perhaps we could iterate over the physical indices in each array, and then use equal_range to compare the underlying values, yes this will be slow, but we do something similar for other nested types and it hasn't caused issues



##########
arrow-array/src/array/run_array.rs:
##########
@@ -233,6 +275,63 @@ impl<R: RunEndIndexType> RunArray<R> {
         }
         Ok(physical_indices)
     }
+
+    /// Returns a `RunArray` with zero offset and length matching the last value
+    /// in run_ends array.
+    pub fn into_non_sliced_array(self) -> Result<Self, ArrowError> {
+        if self.data.offset() == 0 && self.data.len() == Self::logical_len(&self.run_ends)
+        {
+            return Ok(self);
+        }
+        // The physical index of original run_ends array from which the `ArrayData`is sliced.
+        let start_physical_index = Self::get_physical_index_from_run_ends_array(
+            &self.run_ends,
+            self.data.offset(),
+        )
+        .ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Cannot convert the offset {} to physical index",
+                self.data.offset()
+            ))
+        })?;
+
+        // The logical length of original run_ends array until which the `ArrayData` is sliced.
+        let end_logical_index = self.data.offset() + self.data.len() - 1;
+        // The physical index of original run_ends array until which the `ArrayData`is sliced.
+        let end_physical_index =
+            Self::get_physical_index_from_run_ends_array(&self.run_ends, end_logical_index).ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Cannot convert the `offset + len - 1` {end_logical_index} to physical index"
+                ))
+            })?;
+
+        let physical_length = end_physical_index - start_physical_index + 1;
+
+        // build new run_ends array by subtrating offset from run ends.
+        let new_run_ends: PrimitiveArray<R> = self
+            .run_ends
+            .values()
+            .iter()
+            .skip(start_physical_index)
+            .take(physical_length)
+            .map(|f| f.as_usize() - self.data.offset())
+            .map(|f| f.min(self.len()))

Review Comment:
   If I'm not mistaken this is to handle the final offset, perhaps we could special case this and use `PrimitiveBuilder`



##########
arrow-array/src/array/run_array.rs:
##########
@@ -233,6 +275,63 @@ impl<R: RunEndIndexType> RunArray<R> {
         }
         Ok(physical_indices)
     }
+
+    /// Returns a `RunArray` with zero offset and length matching the last value
+    /// in run_ends array.
+    pub fn into_non_sliced_array(self) -> Result<Self, ArrowError> {
+        if self.data.offset() == 0 && self.data.len() == Self::logical_len(&self.run_ends)
+        {
+            return Ok(self);
+        }
+        // The physical index of original run_ends array from which the `ArrayData`is sliced.
+        let start_physical_index = Self::get_physical_index_from_run_ends_array(
+            &self.run_ends,
+            self.data.offset(),
+        )
+        .ok_or_else(|| {
+            ArrowError::InvalidArgumentError(format!(
+                "Cannot convert the offset {} to physical index",
+                self.data.offset()
+            ))
+        })?;
+
+        // The logical length of original run_ends array until which the `ArrayData` is sliced.
+        let end_logical_index = self.data.offset() + self.data.len() - 1;
+        // The physical index of original run_ends array until which the `ArrayData`is sliced.
+        let end_physical_index =
+            Self::get_physical_index_from_run_ends_array(&self.run_ends, end_logical_index).ok_or_else(|| {
+                ArrowError::InvalidArgumentError(format!(
+                    "Cannot convert the `offset + len - 1` {end_logical_index} to physical index"
+                ))
+            })?;
+
+        let physical_length = end_physical_index - start_physical_index + 1;
+
+        // build new run_ends array by subtrating offset from run ends.
+        let new_run_ends: PrimitiveArray<R> = self
+            .run_ends
+            .values()
+            .iter()
+            .skip(start_physical_index)
+            .take(physical_length)
+            .map(|f| f.as_usize() - self.data.offset())
+            .map(|f| f.min(self.len()))
+            .map(R::Native::from_usize)
+            .collect();

Review Comment:
   FWIW using `BufferBuilder` directly will be significantly faster, as it doesn't need to handle nulls, LLVM is not that smart at optimising iterators



##########
arrow-data/src/equal/run.rs:
##########
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::ArrayData;
+
+use super::equal_range;
+
+/// The current implementation of comparison of run array support partial comparison.
+/// Comparing run encoded array based on logical indices (`lhs_start`, `rhs_start`) will
+/// be time consuming as converting from logical index to physical index cannot be done
+/// in constat time. The current comparison compares the underlying physical arrays.
+pub(super) fn run_equal(
+    lhs: &ArrayData,
+    rhs: &ArrayData,
+    lhs_start: usize,
+    rhs_start: usize,
+    len: usize,
+) -> bool {
+    if lhs_start != 0
+        || rhs_start != 0
+        || (lhs.len() != len && rhs.len() != len)
+        || lhs.offset() > 0
+        || rhs.offset() > 0
+    {
+        unimplemented!("Partial comparison for run array not supported.")

Review Comment:
   FWIW I think it would be really good to support logical comparison, even if the first version is slow, as this is consistent with the other arrays



##########
arrow-array/src/array/run_array.rs:
##########
@@ -233,6 +275,63 @@ impl<R: RunEndIndexType> RunArray<R> {
         }
         Ok(physical_indices)
     }
+
+    /// Returns a `RunArray` with zero offset and length matching the last value
+    /// in run_ends array.
+    pub fn into_non_sliced_array(self) -> Result<Self, ArrowError> {

Review Comment:
   ```suggestion
       pub fn materialize_array_offset(self) -> Result<Self, ArrowError> {
   ```
   
   Or something, if we move this to be an implementation detail of IPC this effectively becomes an implementation detail



##########
arrow-ipc/src/reader.rs:
##########
@@ -194,6 +194,50 @@ fn create_array(
             };
             Arc::new(struct_array)
         }
+        RunEndEncoded(run_ends_field, values_field) => {
+            let run_node = nodes.get(node_index);
+            node_index += 1;
+
+            let run_ends_triple = create_array(
+                nodes,
+                run_ends_field,
+                data,
+                buffers,
+                dictionaries_by_id,
+                node_index,
+                buffer_index,
+                compression_codec,
+                metadata,
+            )?;
+            node_index = run_ends_triple.1;
+            buffer_index = run_ends_triple.2;
+
+            let values_triple = create_array(
+                nodes,
+                values_field,
+                data,
+                buffers,
+                dictionaries_by_id,
+                node_index,
+                buffer_index,
+                compression_codec,
+                metadata,
+            )?;
+            node_index = values_triple.1;
+            buffer_index = values_triple.2;
+
+            let run_array_length = run_node.length() as usize;
+            let run_array_null_count = run_node.null_count() as usize;
+            let data = ArrayData::builder(data_type.clone())
+                .len(run_array_length)
+                .null_count(run_array_null_count)
+                .offset(0)
+                .add_child_data(run_ends_triple.0.into_data())
+                .add_child_data(values_triple.0.into_data())
+                .build()?;

Review Comment:
   :+1: 



##########
arrow-array/src/array/run_array.rs:
##########
@@ -552,6 +651,33 @@ mod tests {
         result
     }
 
+    fn compare_logical_and_physical_indices(

Review Comment:
   I found it very hard to work out what this method was doing, some docstrings would go a long way



##########
arrow-data/src/equal/run.rs:
##########
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::ArrayData;
+
+use super::equal_range;
+
+/// The current implementation of comparison of run array support partial comparison.

Review Comment:
   ```suggestion
   /// The current implementation of comparison of run array support physical comparison.
   ```



##########
arrow-array/src/array/run_array.rs:
##########
@@ -824,23 +998,82 @@ mod tests {
             assert_eq!(logical_indices.len(), physical_indices.len());
 
             // check value in logical index in the input_array matches physical index in typed_run_array
-            logical_indices
+            compare_logical_and_physical_indices(
+                &logical_indices,
+                &input_array,
+                &physical_indices,
+                physical_values_array,
+            );
+        }
+    }
+
+    #[test]
+    fn test_get_physical_indices_sliced() {
+        let total_len = 80;
+        let input_array = build_input_array(total_len);
+
+        // Encode the input_array to run array
+        let mut builder =
+            PrimitiveRunBuilder::<Int16Type, Int32Type>::with_capacity(input_array.len());
+        builder.extend(input_array.iter().copied());
+        let run_array = builder.finish();
+        let physical_values_array = as_primitive_array::<Int32Type>(run_array.values());
+
+        // test for all slice lengths.
+        for slice_len in 1..=total_len {
+            // create an array consisting of all the indices repeated twice and shuffled.
+            let mut logical_indices: Vec<u32> = (0_u32..(slice_len as u32)).collect();
+            // add same indices once more
+            logical_indices.append(&mut logical_indices.clone());
+            let mut rng = thread_rng();
+            logical_indices.shuffle(&mut rng);
+
+            // test for offset = 0 and slice length = slice_len
+            // slice the input array using which the run array was built.
+            let sliced_input_array: Vec<Option<i32>> =
+                input_array.iter().take(slice_len).copied().collect();
+
+            // slice the run array
+            let sliced_run_array: RunArray<Int16Type> =
+                run_array.slice(0, slice_len).into_data().into();
+
+            // Get physical indices.
+            let physical_indices = sliced_run_array
+                .get_physical_indices(&logical_indices)
+                .unwrap();
+
+            compare_logical_and_physical_indices(
+                &logical_indices,
+                &sliced_input_array,
+                &physical_indices,
+                physical_values_array,
+            );
+
+            // test for offset = total_len - slice_len and slice length = slice_len
+            // slice the input array using which the run array was built.
+            let sliced_input_array: Vec<Option<i32>> = input_array

Review Comment:
   ```suggestion
               let sliced_input_array = &input_array[total_len - slice_len..total_len]
   ```



##########
arrow-data/src/equal/run.rs:
##########
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::data::ArrayData;
+
+use super::equal_range;
+
+/// The current implementation of comparison of run array support partial comparison.
+/// Comparing run encoded array based on logical indices (`lhs_start`, `rhs_start`) will
+/// be time consuming as converting from logical index to physical index cannot be done
+/// in constat time. The current comparison compares the underlying physical arrays.
+pub(super) fn run_equal(
+    lhs: &ArrayData,
+    rhs: &ArrayData,
+    lhs_start: usize,
+    rhs_start: usize,
+    len: usize,
+) -> bool {
+    if lhs_start != 0
+        || rhs_start != 0
+        || (lhs.len() != len && rhs.len() != len)
+        || lhs.offset() > 0
+        || rhs.offset() > 0
+    {
+        unimplemented!("Partial comparison for run array not supported.")
+    }
+
+    if lhs.len() != rhs.len() {
+        return false;
+    }
+
+    // This method does validation of the lhs array and rhs array required to do its
+    // function. This method does not ensure the validity of lhs and rhs array as run array.

Review Comment:
   We can safely assume validity, there shouldn't be a safe way to create a invalid ArrayData



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org