You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by su...@apache.org on 2019/03/05 17:57:09 UTC
[arrow] branch master updated: ARROW-4769: [Rust] Improve array limit fn where max_records >= len

This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 22cdd42  ARROW-4769: [Rust] Improve array limit fn where max_records >= len
22cdd42 is described below

commit 22cdd4252371218ee254445ba8b69fea719cc16b
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Tue Mar 5 09:56:55 2019 -0800

    ARROW-4769: [Rust] Improve array limit fn where max_records >= len
    
    This yields a ~55% reduction in runtime.
    
    Author: Neville Dipale <ne...@gmail.com>
    
    Closes #3811 from nevi-me/ARROW-4769 and squashes the following commits:
    
    6cfdfe41 <Neville Dipale> ARROW-4769:  Improve array limit fn where max_records >= len
---
 rust/arrow/benches/arithmetic_kernels.rs | 15 +++++++-
 rust/arrow/src/compute/array_ops.rs      | 59 +++++++++++++++++---------------
 rust/datafusion/src/execution/limit.rs   |  2 +-
 3 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/rust/arrow/benches/arithmetic_kernels.rs b/rust/arrow/benches/arithmetic_kernels.rs
index be6d0ae..dd1c435 100644
--- a/rust/arrow/benches/arithmetic_kernels.rs
+++ b/rust/arrow/benches/arithmetic_kernels.rs
@@ -19,12 +19,14 @@
 extern crate criterion;
 use criterion::Criterion;
 
+use std::sync::Arc;
+
 extern crate arrow;
 
 use arrow::array::*;
 use arrow::builder::*;
 use arrow::compute::arithmetic_kernels::*;
-use arrow::compute::array_ops::sum;
+use arrow::compute::array_ops::{limit, sum};
 use arrow::error::Result;
 
 fn create_array(size: usize) -> Float32Array {
@@ -67,6 +69,11 @@ fn sum_no_simd(size: usize) {
     criterion::black_box(sum(&arr_a).unwrap());
 }
 
+fn limit_no_simd(size: usize, max: usize) {
+    let arr_a: ArrayRef = Arc::new(create_array(size));
+    criterion::black_box(limit(&arr_a, max).unwrap());
+}
+
 fn add_benchmark(c: &mut Criterion) {
     c.bench_function("add 512", |b| {
         b.iter(|| bin_op_no_simd(512, |a, b| Ok(a + b)))
@@ -81,6 +88,12 @@ fn add_benchmark(c: &mut Criterion) {
     });
     c.bench_function("multiply 512 simd", |b| b.iter(|| multiply_simd(512)));
     c.bench_function("sum 512 no simd", |b| b.iter(|| sum_no_simd(512)));
+    c.bench_function("limit 512, 256 no simd", |b| {
+        b.iter(|| limit_no_simd(512, 256))
+    });
+    c.bench_function("limit 512, 512 no simd", |b| {
+        b.iter(|| limit_no_simd(512, 512))
+    });
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/rust/arrow/src/compute/array_ops.rs b/rust/arrow/src/compute/array_ops.rs
index dc1730f..088661d 100644
--- a/rust/arrow/src/compute/array_ops.rs
+++ b/rust/arrow/src/compute/array_ops.rs
@@ -17,7 +17,6 @@
 
 //! Defines primitive computations on arrays, e.g. addition, equality, boolean logic.
 
-use std::cmp;
 use std::ops::Add;
 use std::sync::Arc;
 
@@ -201,26 +200,29 @@ macro_rules! limit_array {
 
 /// Returns the array, taking only the number of elements specified
 ///
-/// Returns the whole array if the number of elements specified is larger than the length of the array
-pub fn limit(array: &Array, num_elements: usize) -> Result<ArrayRef> {
-    let num_elements_safe: usize = cmp::min(array.len(), num_elements);
+/// Returns the whole array if the number of elements specified is larger than the length
+/// of the array
+pub fn limit(array: &ArrayRef, num_elements: usize) -> Result<ArrayRef> {
+    if num_elements >= array.len() {
+        return Ok(array.clone());
+    }
 
     match array.data_type() {
-        DataType::UInt8 => limit_array!(array, num_elements_safe, UInt8Array),
-        DataType::UInt16 => limit_array!(array, num_elements_safe, UInt16Array),
-        DataType::UInt32 => limit_array!(array, num_elements_safe, UInt32Array),
-        DataType::UInt64 => limit_array!(array, num_elements_safe, UInt64Array),
-        DataType::Int8 => limit_array!(array, num_elements_safe, Int8Array),
-        DataType::Int16 => limit_array!(array, num_elements_safe, Int16Array),
-        DataType::Int32 => limit_array!(array, num_elements_safe, Int32Array),
-        DataType::Int64 => limit_array!(array, num_elements_safe, Int64Array),
-        DataType::Float32 => limit_array!(array, num_elements_safe, Float32Array),
-        DataType::Float64 => limit_array!(array, num_elements_safe, Float64Array),
-        DataType::Boolean => limit_array!(array, num_elements_safe, BooleanArray),
+        DataType::UInt8 => limit_array!(array, num_elements, UInt8Array),
+        DataType::UInt16 => limit_array!(array, num_elements, UInt16Array),
+        DataType::UInt32 => limit_array!(array, num_elements, UInt32Array),
+        DataType::UInt64 => limit_array!(array, num_elements, UInt64Array),
+        DataType::Int8 => limit_array!(array, num_elements, Int8Array),
+        DataType::Int16 => limit_array!(array, num_elements, Int16Array),
+        DataType::Int32 => limit_array!(array, num_elements, Int32Array),
+        DataType::Int64 => limit_array!(array, num_elements, Int64Array),
+        DataType::Float32 => limit_array!(array, num_elements, Float32Array),
+        DataType::Float64 => limit_array!(array, num_elements, Float64Array),
+        DataType::Boolean => limit_array!(array, num_elements, BooleanArray),
         DataType::Utf8 => {
             let b = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-            let mut values: Vec<&[u8]> = Vec::with_capacity(num_elements_safe);
-            for i in 0..num_elements_safe {
+            let mut values: Vec<&[u8]> = Vec::with_capacity(num_elements);
+            for i in 0..num_elements {
                 values.push(b.value(i));
             }
             Ok(Arc::new(BinaryArray::from(values)))
@@ -235,7 +237,9 @@ pub fn limit(array: &Array, num_elements: usize) -> Result<ArrayRef> {
 #[cfg(test)]
 mod tests {
     use super::*;
-    use crate::array::{Float64Array, Int32Array};
+    use crate::array::{ArrayRef, Float64Array, Int32Array};
+
+    use std::sync::Arc;
 
     #[test]
     fn test_primitive_array_sum() {
@@ -309,7 +313,7 @@ mod tests {
 
     #[test]
     fn test_limit_array() {
-        let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![5, 6, 7, 8, 9]));
         let b = limit(&a, 3).unwrap();
         let c = b.as_ref().as_any().downcast_ref::<Int32Array>().unwrap();
         assert_eq!(3, c.len());
@@ -320,7 +324,7 @@ mod tests {
 
     #[test]
     fn test_limit_binary_array() {
-        let a = BinaryArray::from(vec!["hello", " ", "world", "!"]);
+        let a: ArrayRef = Arc::new(BinaryArray::from(vec!["hello", " ", "world", "!"]));
         let b = limit(&a, 2).unwrap();
         let c = b.as_ref().as_any().downcast_ref::<BinaryArray>().unwrap();
         assert_eq!(2, c.len());
@@ -330,7 +334,7 @@ mod tests {
 
     #[test]
     fn test_limit_array_with_null() {
-        let a = Int32Array::from(vec![None, Some(5)]);
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![None, Some(5)]));
         let b = limit(&a, 1).unwrap();
         let c = b.as_ref().as_any().downcast_ref::<Int32Array>().unwrap();
         assert_eq!(1, c.len());
@@ -340,14 +344,15 @@ mod tests {
     #[test]
     fn test_limit_array_with_limit_too_large() {
         let a = Int32Array::from(vec![5, 6, 7, 8, 9]);
-        let b = limit(&a, 6).unwrap();
+        let a_ref: ArrayRef = Arc::new(a);
+        let b = limit(&a_ref, 6).unwrap();
         let c = b.as_ref().as_any().downcast_ref::<Int32Array>().unwrap();
 
         assert_eq!(5, c.len());
-        assert_eq!(a.value(0), c.value(0));
-        assert_eq!(a.value(1), c.value(1));
-        assert_eq!(a.value(2), c.value(2));
-        assert_eq!(a.value(3), c.value(3));
-        assert_eq!(a.value(4), c.value(4));
+        assert_eq!(5, c.value(0));
+        assert_eq!(6, c.value(1));
+        assert_eq!(7, c.value(2));
+        assert_eq!(8, c.value(3));
+        assert_eq!(9, c.value(4));
     }
 }
diff --git a/rust/datafusion/src/execution/limit.rs b/rust/datafusion/src/execution/limit.rs
index 888fac5..bfd8706 100644
--- a/rust/datafusion/src/execution/limit.rs
+++ b/rust/datafusion/src/execution/limit.rs
@@ -59,7 +59,7 @@ impl Relation for LimitRelation {
 
                 if batch.num_rows() >= capacity {
                     let limited_columns: Result<Vec<ArrayRef>> = (0..batch.num_columns())
-                        .map(|i| match limit(batch.column(i).as_ref(), capacity) {
+                        .map(|i| match limit(batch.column(i), capacity) {
                             Ok(result) => Ok(result),
                             Err(error) => Err(ExecutionError::from(error)),
                         })