You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/01/01 10:50:16 UTC
[arrow] branch master updated: ARROW-11035: [Rust] Improved performance of casting to utf8
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new fcc2227 ARROW-11035: [Rust] Improved performance of casting to utf8
fcc2227 is described below
commit fcc222708eaee06e8a17f45b3368ffe290130861
Author: Jorge C. Leitao <jo...@gmail.com>
AuthorDate: Fri Jan 1 05:49:27 2021 -0500
ARROW-11035: [Rust] Improved performance of casting to utf8
```
cast i64 to string 512 time: [92.618 us 92.839 us 93.097 us]
change: [-14.915% -14.287% -13.743%] (p = 0.00 < 0.05)
Performance has improved.
Found 5 outliers among 100 measurements (5.00%)
```
Closes #9014 from jorgecarleitao/speed_cast
Authored-by: Jorge C. Leitao <jo...@gmail.com>
Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
rust/arrow/benches/cast_kernels.rs | 3 ++
rust/arrow/src/compute/kernels/cast.rs | 69 ++++++++++++----------------------
rust/arrow/src/csv/reader.rs | 15 ++------
rust/arrow/src/json/reader.rs | 23 ++++--------
4 files changed, 39 insertions(+), 71 deletions(-)
diff --git a/rust/arrow/benches/cast_kernels.rs b/rust/arrow/benches/cast_kernels.rs
index b9e33cc..81232e5 100644
--- a/rust/arrow/benches/cast_kernels.rs
+++ b/rust/arrow/benches/cast_kernels.rs
@@ -193,6 +193,9 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function("cast utf8 to f32", |b| {
b.iter(|| cast_array(&f32_utf8_array, DataType::Float32))
});
+ c.bench_function("cast i64 to string 512", |b| {
+ b.iter(|| cast_array(&i64_array, DataType::Utf8))
+ });
c.bench_function("cast timestamp_ms to i64 512", |b| {
b.iter(|| cast_array(&time_ms_array, DataType::Int64))
diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs
index f112876..6725fc0 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -351,17 +351,13 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
Float32 => cast_bool_to_numeric::<Float32Type>(array),
Float64 => cast_bool_to_numeric::<Float64Type>(array),
Utf8 => {
- let from = array.as_any().downcast_ref::<BooleanArray>().unwrap();
- let mut b = StringBuilder::new(array.len());
- for i in 0..array.len() {
- if array.is_null(i) {
- b.append(false)?;
- } else {
- b.append_value(if from.value(i) { "1" } else { "0" })?;
- }
- }
-
- Ok(Arc::new(b.finish()) as ArrayRef)
+ let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
+ Ok(Arc::new(
+ array
+ .iter()
+ .map(|value| value.map(|value| if value { "1" } else { "0" }))
+ .collect::<StringArray>(),
+ ))
}
_ => Err(ArrowError::ComputeError(format!(
"Casting from {:?} to {:?} not supported",
@@ -431,20 +427,15 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
Float32 => cast_numeric_to_string::<Float32Type>(array),
Float64 => cast_numeric_to_string::<Float64Type>(array),
Binary => {
- let from = array.as_any().downcast_ref::<BinaryArray>().unwrap();
- let mut b = StringBuilder::new(array.len());
- for i in 0..array.len() {
- if array.is_null(i) {
- b.append_null()?;
- } else {
- match str::from_utf8(from.value(i)) {
- Ok(s) => b.append_value(s)?,
- Err(_) => b.append_null()?, // not valid UTF8
- }
- }
- }
-
- Ok(Arc::new(b.finish()) as ArrayRef)
+ let array = array.as_any().downcast_ref::<BinaryArray>().unwrap();
+ Ok(Arc::new(
+ array
+ .iter()
+ .map(|maybe_value| {
+ maybe_value.and_then(|value| str::from_utf8(value).ok())
+ })
+ .collect::<StringArray>(),
+ ))
}
_ => Err(ArrowError::ComputeError(format!(
"Casting from {:?} to {:?} not supported",
@@ -892,31 +883,22 @@ where
FROM: ArrowNumericType,
FROM::Native: std::string::ToString,
{
- numeric_to_string_cast::<FROM>(
+ Ok(Arc::new(numeric_to_string_cast::<FROM>(
array
.as_any()
.downcast_ref::<PrimitiveArray<FROM>>()
.unwrap(),
- )
- .map(|to| Arc::new(to) as ArrayRef)
+ )))
}
-fn numeric_to_string_cast<T>(from: &PrimitiveArray<T>) -> Result<StringArray>
+fn numeric_to_string_cast<T>(from: &PrimitiveArray<T>) -> StringArray
where
T: ArrowPrimitiveType + ArrowNumericType,
T::Native: std::string::ToString,
{
- let mut b = StringBuilder::new(from.len());
-
- for i in 0..from.len() {
- if from.is_null(i) {
- b.append(false)?;
- } else {
- b.append_value(&from.value(i).to_string())?;
- }
- }
-
- Ok(b.finish())
+ from.iter()
+ .map(|maybe_value| maybe_value.map(|value| value.to_string()))
+ .collect()
}
/// Cast numeric types to Utf8
@@ -2714,11 +2696,8 @@ mod tests {
fn test_cast_string_array_to_dict() {
use DataType::*;
- let mut builder = StringBuilder::new(10);
- builder.append_value("one").unwrap();
- builder.append_null().unwrap();
- builder.append_value("three").unwrap();
- let array: ArrayRef = Arc::new(builder.finish());
+ let array = Arc::new(StringArray::from(vec![Some("one"), None, Some("three")]))
+ as ArrayRef;
let expected = vec!["one", "null", "three"];
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index ac779b0..3fca7b2 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -51,7 +51,7 @@ use std::sync::Arc;
use csv as csv_crate;
-use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringBuilder};
+use crate::array::{ArrayRef, BooleanArray, PrimitiveArray, StringArray};
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::record_batch::RecordBatch;
@@ -449,16 +449,9 @@ fn parse(
&DataType::Date64(_) => {
build_primitive_array::<Date64Type>(line_number, rows, i)
}
- &DataType::Utf8 => {
- let mut builder = StringBuilder::new(rows.len());
- for row in rows.iter() {
- match row.get(i) {
- Some(s) => builder.append_value(s).unwrap(),
- _ => builder.append(false).unwrap(),
- }
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
- }
+ &DataType::Utf8 => Ok(Arc::new(
+ rows.iter().map(|row| row.get(i)).collect::<StringArray>(),
+ ) as ArrayRef),
other => Err(ArrowError::ParseError(format!(
"Unsupported data type {:?}",
other
diff --git a/rust/arrow/src/json/reader.rs b/rust/arrow/src/json/reader.rs
index 39f35bb..7ef418c 100644
--- a/rust/arrow/src/json/reader.rs
+++ b/rust/arrow/src/json/reader.rs
@@ -1124,21 +1124,14 @@ impl Decoder {
t
))),
},
- DataType::Utf8 => {
- let mut builder = StringBuilder::new(rows.len());
- for row in rows {
- if let Some(value) = row.get(field.name()) {
- if let Some(str_v) = value.as_str() {
- builder.append_value(str_v)?
- } else {
- builder.append(false)?
- }
- } else {
- builder.append(false)?
- }
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
- }
+ DataType::Utf8 => Ok(Arc::new(
+ rows.iter()
+ .map(|row| {
+ let maybe_value = row.get(field.name());
+ maybe_value.and_then(|value| value.as_str())
+ })
+ .collect::<StringArray>(),
+ ) as ArrayRef),
DataType::List(ref list_field) => {
match list_field.data_type() {
DataType::Dictionary(ref key_ty, _) => {