You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/10 20:18:07 UTC

[GitHub] [arrow-rs] alamb commented on a diff in pull request #1684: simplify offsets checking

alamb commented on code in PR #1684:
URL: https://github.com/apache/arrow-rs/pull/1684#discussion_r869644804


##########
arrow/src/array/data.rs:
##########
@@ -1033,72 +1035,61 @@ impl ArrayData {
     }
 
     /// Calls the `validate(item_index, range)` function for each of
-    /// the ranges specified in the arrow offset buffer of type
+    /// the ranges specified in the arrow offsets buffer of type
     /// `T`. Also validates that each offset is smaller than
-    /// `max_offset`
+    /// `offset_limit`
+    ///
+    /// For an empty array, the offsets buffer can either be empty
+    /// or contain a single `0`.
     ///
-    /// For example, the offset buffer contained `[1, 2, 4]`, this
+    /// For example, the offsets buffer contained `[1, 2, 4]`, this
     /// function would call `validate([1,2])`, and `validate([2,4])`
     fn validate_each_offset<T, V>(
         &self,
-        offset_buffer: &Buffer,
+        offsets_buffer: &Buffer,
         offset_limit: usize,
         validate: V,
     ) -> Result<()>
     where
         T: ArrowNativeType + std::convert::TryInto<usize> + num::Num + std::fmt::Display,
         V: Fn(usize, Range<usize>) -> Result<()>,
     {
-        // An empty binary-like array can have 0 offsets
-        if self.len == 0 && offset_buffer.is_empty() {
-            return Ok(());
-        }
-
-        let offsets = self.typed_offsets::<T>(offset_buffer)?;
-
-        offsets
+        let offsets = self.typed_offsets::<T>(offsets_buffer)?
             .iter()
-            .zip(offsets.iter().skip(1))
             .enumerate()
-            .map(|(i, (&start_offset, &end_offset))| {
-                let start_offset: usize = start_offset
-                    .try_into()
-                    .map_err(|_| {
-                        ArrowError::InvalidArgumentError(format!(
-                            "Offset invariant failure: could not convert start_offset {} to usize in slot {}",
-                            start_offset, i))
-                    })?;
-                let end_offset: usize = end_offset
-                    .try_into()
-                    .map_err(|_| {
-                        ArrowError::InvalidArgumentError(format!(
-                            "Offset invariant failure: Could not convert end_offset {} to usize in slot {}",
-                            end_offset, i+1))
-                    })?;
-
-                if start_offset > offset_limit {
-                    return Err(ArrowError::InvalidArgumentError(format!(
-                        "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
-                        i, start_offset, offset_limit))
-                    );
-                }
-
-                if end_offset > offset_limit {
-                    return Err(ArrowError::InvalidArgumentError(format!(
-                        "Offset invariant failure: offset for slot {} out of bounds: {} > {}",
-                        i, end_offset, offset_limit))
+            .map(|(i, x)| {
+                // check if the offset can be converted to usize
+                let r = x.to_usize().ok_or_else(|| {
+                    ArrowError::InvalidArgumentError(format!(
+                        "Offset invariant failure: Could not convert offset {} to usize at position {}",
+                        x, i))}
                     );
+                // check if the offset exceeds the limit
+                match r {
+                    Ok(n) if n <= offset_limit => Ok(n),
+                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
+                        "Offset invariant failure: offset at position {} out of bounds: {} > {}",
+                        i, x, offset_limit))
+                    ),
+                    err => err,
                 }
+            })
+            .collect::<Result<Vec<usize>>>()?;

Review Comment:
   The downside of calling `collect()` is that it will buffer the intermediate result (aka allocate memory and save the offsets in another buffer)



##########
arrow/src/array/data.rs:
##########
@@ -1821,6 +1812,90 @@ mod tests {
         .unwrap();
     }
 
+    #[test]
+    fn test_empty_utf8_array_with_empty_offsets_buffer() {
+        let data_buffer = Buffer::from(&[]);
+        let offsets_buffer = Buffer::from(&[]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            0,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_empty_utf8_array_with_single_zero_offset() {
+        let data_buffer = Buffer::from(&[]);
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            0,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")]
+    fn test_empty_utf8_array_with_invalid_offset() {
+        let data_buffer = Buffer::from(&[]);
+        let offsets_buffer = Buffer::from_slice_ref(&[1i32]);
+        ArrayData::try_new(
+            DataType::Utf8,
+            0,
+            None,
+            None,
+            0,
+            vec![offsets_buffer, data_buffer],
+            vec![],
+        )
+        .unwrap();
+    }
+
+    #[test]
+    fn test_empty_utf8_array_with_non_zero_offset() {
+        let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes());
+        let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]);

Review Comment:
   yeah, I agree this is strange but ok



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org