You are viewing a plain text version of this content. The canonical link to the original was not preserved in this plain-text copy.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/01/01 10:50:16 UTC

[arrow] branch master updated: ARROW-11035: [Rust] Improved performance of casting to utf8

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new fcc2227  ARROW-11035: [Rust] Improved performance of casting to utf8
fcc2227 is described below

commit fcc222708eaee06e8a17f45b3368ffe290130861
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Fri Jan 1 05:49:27 2021 -0500

    ARROW-11035: [Rust] Improved performance of casting to utf8
    
    ```
    cast i64 to string 512  time:   [92.618 us 92.839 us 93.097 us]
                            change: [-14.915% -14.287% -13.743%] (p = 0.00 < 0.05)
                            Performance has improved.
    Found 5 outliers among 100 measurements (5.00%)
    ```
    
    Closes #9014 from jorgecarleitao/speed_cast
    
    Authored-by: Jorge C. Leitao <jo...@gmail.com>
    Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
 rust/arrow/benches/cast_kernels.rs     |  3 ++
 rust/arrow/src/compute/kernels/cast.rs | 69 ++++++++++++----------------------
 rust/arrow/src/csv/reader.rs           | 15 ++------
 rust/arrow/src/json/reader.rs          | 23 ++++--------
 4 files changed, 39 insertions(+), 71 deletions(-)

diff --git a/rust/arrow/benches/cast_kernels.rs b/rust/arrow/benches/cast_kernels.rs
index b9e33cc..81232e5 100644
--- a/rust/arrow/benches/cast_kernels.rs
+++ b/rust/arrow/benches/cast_kernels.rs
@@ -193,6 +193,9 @@ fn add_benchmark(c: &mut Criterion) {
     c.bench_function("cast utf8 to f32", |b| {
         b.iter(|| cast_array(&f32_utf8_array, DataType::Float32))
     });
+    c.bench_function("cast i64 to string 512", |b| {
+        b.iter(|| cast_array(&i64_array, DataType::Utf8))
+    });
 
     c.bench_function("cast timestamp_ms to i64 512", |b| {
         b.iter(|| cast_array(&time_ms_array, DataType::Int64))
diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs
index f112876..6725fc0 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -351,17 +351,13 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
             Float32 => cast_bool_to_numeric::<Float32Type>(array),
             Float64 => cast_bool_to_numeric::<Float64Type>(array),
             Utf8 => {
-                let from = array.as_any().downcast_ref::<BooleanArray>().unwrap();
-                let mut b = StringBuilder::new(array.len());
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        b.append(false)?;
-                    } else {
-                        b.append_value(if from.value(i) { "1" } else { "0" })?;
-                    }
-                }
-
-                Ok(Arc::new(b.finish()) as ArrayRef)
+                let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
+                Ok(Arc::new(
+                    array
+                        .iter()
+                        .map(|value| value.map(|value| if value { "1" } else { "0" }))
+                        .collect::<StringArray>(),
+                ))
             }
             _ => Err(ArrowError::ComputeError(format!(
                 "Casting from {:?} to {:?} not supported",
@@ -431,20 +427,15 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
             Float32 => cast_numeric_to_string::<Float32Type>(array),
             Float64 => cast_numeric_to_string::<Float64Type>(array),
             Binary => {
-                let from = array.as_any().downcast_ref::<BinaryArray>().unwrap();
-                let mut b = StringBuilder::new(array.len());
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        b.append_null()?;
-                    } else {
-                        match str::from_utf8(from.value(i)) {
-                            Ok(s) => b.append_value(s)?,
-                            Err(_) => b.append_null()?, // not valid UTF8
-                        }
-                    }
-                }
-
-                Ok(Arc::new(b.finish()) as ArrayRef)
+                let array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
+                Ok(Arc::new(
+                    array
+                        .iter()
+                        .map(|maybe_value| {
+                            maybe_value.and_then(|value| str::from_utf8(value).ok())
+                        })
+                        .collect::<StringArray>(),
+                ))
             }
             _ => Err(ArrowError::ComputeError(format!(
                 "Casting from {:?} to {:?} not supported",
@@ -892,31 +883,22 @@ where
     FROM: ArrowNumericType,
     FROM::Native: std::string::ToString,
 {
-    numeric_to_string_cast::<FROM>(
+    Ok(Arc::new(numeric_to_string_cast::<FROM>(
         array
             .as_any()
             .downcast_ref::<PrimitiveArray<FROM>>()
             .unwrap(),
-    )
-    .map(|to| Arc::new(to) as ArrayRef)
+    )))
 }
 
-fn numeric_to_string_cast<T>(from: &PrimitiveArray<T>) -> Result<StringArray>
+fn numeric_to_string_cast<T>(from: &PrimitiveArray<T>) -> StringArray
 where
     T: ArrowPrimitiveType + ArrowNumericType,
     T::Native: std::string::ToString,
 {
-    let mut b = StringBuilder::new(from.len());
-
-    for i in 0..from.len() {
-        if from.is_null(i) {
-            b.append(false)?;
-        } else {
-            b.append_value(&from.value(i).to_string())?;
-        }
-    }
-
-    Ok(b.finish())
+    from.iter()
+        .map(|maybe_value| maybe_value.map(|value| value.to_string()))
+        .collect()
 }
 
 /// Cast numeric types to Utf8
@@ -2714,11 +2696,8 @@ mod tests {
     fn test_cast_string_array_to_dict() {
         use DataType::*;
 
-        let mut builder = StringBuilder::new(10);
-        builder.append_value("one").unwrap();
-        builder.append_null().unwrap();
-        builder.append_value("three").unwrap();
-        let array: ArrayRef = Arc::new(builder.finish());
+        let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")]))
+            as ArrayRef;
 
         let expected = vec!["one", "null", "three"];
 
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index ac779b0..3fca7b2 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -51,7 +51,7 @@ use std::sync::Arc;
 
 use csv as csv_crate;
 
-use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringBuilder};
+use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray};
 use crate::datatypes::*;
 use crate::error::{ArrowError, Result};
 use crate::record_batch::RecordBatch;
@@ -449,16 +449,9 @@ fn parse(
                 &DataType::Date64(_) => {
                     build_primitive_array::<Date64Type>(line_number, rows, i)
                 }
-                &DataType::Utf8 => {
-                    let mut builder = StringBuilder::new(rows.len());
-                    for row in rows.iter() {
-                        match row.get(i) {
-                            Some(s) => builder.append_value(s).unwrap(),
-                            _ => builder.append(false).unwrap(),
-                        }
-                    }
-                    Ok(Arc::new(builder.finish()) as ArrayRef)
-                }
+                &DataType::Utf8 => Ok(Arc::new(
+                    rows.iter().map(|row| row.get(i)).collect::<StringArray>(),
+                ) as ArrayRef),
                 other => Err(ArrowError::ParseError(format!(
                     "Unsupported data type {:?}",
                     other
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index 39f35bb..7ef418c 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -1124,21 +1124,14 @@ impl Decoder {
                             t
                         ))),
                     },
-                    DataType::Utf8 => {
-                        let mut builder = StringBuilder::new(rows.len());
-                        for row in rows {
-                            if let Some(value) = row.get(field.name()) {
-                                if let Some(str_v) = value.as_str() {
-                                    builder.append_value(str_v)?
-                                } else {
-                                    builder.append(false)?
-                                }
-                            } else {
-                                builder.append(false)?
-                            }
-                        }
-                        Ok(Arc::new(builder.finish()) as ArrayRef)
-                    }
+                    DataType::Utf8 => Ok(Arc::new(
+                        rows.iter()
+                            .map(|row| {
+                                let maybe_value = row.get(field.name());
+                                maybe_value.and_then(|value| value.as_str())
+                            })
+                            .collect::<StringArray>(),
+                    ) as ArrayRef),
                     DataType::List(ref list_field) => {
                         match list_field.data_type() {
                             DataType::Dictionary(ref key_ty, _) => {