You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/06/09 18:09:51 UTC

[arrow-rs] branch active_release updated: Fix out of bounds read in bit chunk iterator (#416) (#432)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch active_release
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/active_release by this push:
     new 8179193  Fix out of bounds read in bit chunk iterator (#416) (#432)
8179193 is described below

commit 8179193a0f1cc4602d05e201b4f1ae16142eec10
Author: Andrew Lamb <an...@nerdnetworks.org>
AuthorDate: Wed Jun 9 14:09:45 2021 -0400

    Fix out of bounds read in bit chunk iterator (#416) (#432)
    
    * Fix out of bounds read in bit chunk iterator
    
    * Add comment why reading one additional byte is enough
    
    Co-authored-by: Jörn Horstmann <gi...@jhorstmann.net>
---
 arrow/benches/boolean_kernels.rs     | 19 +++++++++++++++++++
 arrow/src/util/bit_chunk_iterator.rs | 36 +++++++++++++++++++++++++++---------
 2 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/arrow/benches/boolean_kernels.rs b/arrow/benches/boolean_kernels.rs
index 6559c4e..e9394f7 100644
--- a/arrow/benches/boolean_kernels.rs
+++ b/arrow/benches/boolean_kernels.rs
@@ -45,6 +45,25 @@ fn add_benchmark(c: &mut Criterion) {
     c.bench_function("and", |b| b.iter(|| bench_and(&array1, &array2)));
     c.bench_function("or", |b| b.iter(|| bench_or(&array1, &array2)));
     c.bench_function("not", |b| b.iter(|| bench_not(&array1)));
+
+    let array1_slice = array1.slice(1, size - 1);
+    let array1_slice = array1_slice
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .unwrap();
+    let array2_slice = array2.slice(1, size - 1);
+    let array2_slice = array2_slice
+        .as_any()
+        .downcast_ref::<BooleanArray>()
+        .unwrap();
+
+    c.bench_function("and_sliced", |b| {
+        b.iter(|| bench_and(&array1_slice, &array2_slice))
+    });
+    c.bench_function("or_sliced", |b| {
+        b.iter(|| bench_or(&array1_slice, &array2_slice))
+    });
+    c.bench_function("not_sliced", |b| b.iter(|| bench_not(&array1_slice)));
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/arrow/src/util/bit_chunk_iterator.rs b/arrow/src/util/bit_chunk_iterator.rs
index b9145b7..ea9280c 100644
--- a/arrow/src/util/bit_chunk_iterator.rs
+++ b/arrow/src/util/bit_chunk_iterator.rs
@@ -35,10 +35,10 @@ impl<'a> BitChunks<'a> {
         let byte_offset = offset / 8;
         let bit_offset = offset % 8;
 
-        let chunk_bits = 8 * std::mem::size_of::<u64>();
-
-        let chunk_len = len / chunk_bits;
-        let remainder_len = len & (chunk_bits - 1);
+        // number of complete u64 chunks
+        let chunk_len = len / 64;
+        // number of remaining bits
+        let remainder_len = len % 64;
 
         BitChunks::<'a> {
             buffer: &buffer[byte_offset..],
@@ -137,14 +137,18 @@ impl Iterator for BitChunkIterator<'_> {
         // so when reading as u64 on a big-endian machine, the bytes need to be swapped
         let current = unsafe { std::ptr::read_unaligned(raw_data.add(index)).to_le() };
 
-        let combined = if self.bit_offset == 0 {
+        let bit_offset = self.bit_offset;
+
+        let combined = if bit_offset == 0 {
             current
         } else {
-            let next =
-                unsafe { std::ptr::read_unaligned(raw_data.add(index + 1)).to_le() };
+            // the constructor ensures that bit_offset is in 0..8
+            // that means we need to read at most one additional byte to fill in the high bits
+            let next = unsafe {
+                std::ptr::read_unaligned(raw_data.add(index + 1) as *const u8) as u64
+            };
 
-            current >> self.bit_offset
-                | (next & ((1 << self.bit_offset) - 1)) << (64 - self.bit_offset)
+            (current >> bit_offset) | (next << (64 - bit_offset))
         };
 
         self.index = index + 1;
@@ -254,4 +258,18 @@ mod tests {
             bitchunks.remainder_bits()
         );
     }
+
+    #[test]
+    fn test_iter_remainder_out_of_bounds() {
+        // allocating a full page should trigger a fault when reading out of bounds
+        const ALLOC_SIZE: usize = 4 * 1024;
+        let input = vec![0xFF_u8; ALLOC_SIZE];
+
+        let buffer: Buffer = Buffer::from(input);
+
+        let bitchunks = buffer.bit_chunks(57, ALLOC_SIZE * 8 - 57);
+
+        assert_eq!(u64::MAX, bitchunks.iter().last().unwrap());
+        assert_eq!(0x7F, bitchunks.remainder_bits());
+    }
 }