You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/05/26 21:38:40 UTC

[arrow-rs] branch active_release updated: respect offset in utf8 and list casts (#335) (#358)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/active_release by this push:
     new 9e15050  respect offset in utf8 and list casts (#335) (#358)
9e15050 is described below

commit 9e1505063954a41e64fc5a54b1dd1c216b491d7c
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed May 26 17:38:33 2021 -0400

    respect offset in utf8 and list casts (#335) (#358)
    
    Co-authored-by: Ritchie Vink <ri...@gmail.com>
---
 arrow/src/compute/kernels/cast.rs | 35 ++++++++++++++++++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs
index de1516b..49543b8 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -1687,6 +1687,7 @@ where
     };
 
     let mut builder = ArrayData::builder(dtype)
+        .offset(array.offset())
         .len(array.len())
         .add_buffer(offset_buffer)
         .add_buffer(str_values_buf);
@@ -1744,7 +1745,12 @@ where
         _ => unreachable!(),
     };
 
-    let offsets = data.buffer::<OffsetSizeFrom>(0);
+    // Safety:
+    //      The first buffer is the offsets and they are aligned to OffSetSizeFrom: (i64 or i32)
+    // Justification:
+    //      The safe variant data.buffer::<OffsetSizeFrom> take the offset into account and we
+    //      cannot create a list array with offsets starting at non zero.
+    let offsets = unsafe { data.buffers()[0].as_slice().align_to::<OffsetSizeFrom>() }.1;
 
     let iter = offsets.iter().map(|idx| {
         let idx: OffsetSizeTo = NumCast::from(*idx).unwrap();
@@ -1757,6 +1763,7 @@ where
 
     // wrap up
     let mut builder = ArrayData::builder(out_dtype)
+        .offset(array.offset())
         .len(array.len())
         .add_buffer(offset_buffer)
         .add_child_data(value_data);
@@ -3840,4 +3847,30 @@ mod tests {
             Dictionary(Box::new(DataType::UInt32), Box::new(DataType::Utf8)),
         ]
     }
+
+    #[test]
+    fn test_utf8_cast_offsets() {
+        // test if offset of the array is taken into account during cast
+        let str_array = StringArray::from(vec!["a", "b", "c"]);
+        let str_array = str_array.slice(1, 2);
+
+        let out = cast(&str_array, &DataType::LargeUtf8).unwrap();
+
+        let large_str_array = out.as_any().downcast_ref::<LargeStringArray>().unwrap();
+        let strs = large_str_array.into_iter().flatten().collect::<Vec<_>>();
+        assert_eq!(strs, &["b", "c"])
+    }
+
+    #[test]
+    fn test_list_cast_offsets() {
+        // test if offset of the array is taken into account during cast
+        let array1 = make_list_array().slice(1, 2);
+        let array2 = Arc::new(make_list_array()) as ArrayRef;
+
+        let dt = DataType::LargeList(Box::new(Field::new("item", DataType::Int32, true)));
+        let out1 = cast(&array1, &dt).unwrap();
+        let out2 = cast(&array2, &dt).unwrap();
+
+        assert_eq!(&out1, &out2.slice(1, 2))
+    }
 }