You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2020/12/21 06:52:30 UTC
[arrow] branch master updated: ARROW-10947: [Rust][DataFusion]
Optimize UTF8 to Date32 Conversion
This is an automated email from the ASF dual-hosted git repository.
jorgecarleitao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 5f73bd5 ARROW-10947: [Rust][DataFusion] Optimize UTF8 to Date32 Conversion
5f73bd5 is described below
commit 5f73bd5bdd0742195d89375848faa2cb88109881
Author: Mike Seddon <se...@gmail.com>
AuthorDate: Mon Dec 21 06:51:00 2020 +0000
ARROW-10947: [Rust][DataFusion] Optimize UTF8 to Date32 Conversion
After benchmarking capability was added for the UTF8 to Date32/Date64 CAST functions, there was an opportunity to improve their performance.
This PR uses built-in `chrono` functionality to calculate the number of days since CE, then uses a constant to calculate the offset in days relative to 1970-01-01. This improves performance by around 10% for this operation relative to the `since` function, presumably because `chrono` does not have to ensure that the date passed to `from_ymd` is valid.
Before:
```
cast utf8 to date32 512 time: [41.966 us 42.508 us 43.087 us]
cast utf8 to date32 512 time: [40.591 us 40.661 us 40.740 us]
cast utf8 to date32 512 time: [40.825 us 40.878 us 40.916 us]
```
After:
```
cast utf8 to date32 512 time: [36.557 us 36.839 us 37.200 us]
cast utf8 to date32 512 time: [35.997 us 36.442 us 36.919 us]
cast utf8 to date32 512 time: [35.750 us 35.969 us 36.160 us]
```
Closes #8943 from seddonm1/utf8-date32-optimize
Authored-by: Mike Seddon <se...@gmail.com>
Signed-off-by: Jorge C. Leitao <jo...@gmail.com>
---
rust/arrow/benches/cast_kernels.rs | 59 +++++++++++++++++++++++++++++++++-
rust/arrow/src/compute/kernels/cast.rs | 24 +++++++-------
rust/arrow/src/csv/reader.rs | 10 +++---
3 files changed, 75 insertions(+), 18 deletions(-)
diff --git a/rust/arrow/benches/cast_kernels.rs b/rust/arrow/benches/cast_kernels.rs
index dbad552..7f6acd2 100644
--- a/rust/arrow/benches/cast_kernels.rs
+++ b/rust/arrow/benches/cast_kernels.rs
@@ -18,8 +18,9 @@
#[macro_use]
extern crate criterion;
use criterion::Criterion;
-use rand::distributions::{Distribution, Standard};
+use rand::distributions::{Distribution, Standard, Uniform};
use rand::prelude::random;
+use rand::Rng;
use std::sync::Arc;
@@ -28,6 +29,7 @@ extern crate arrow;
use arrow::array::*;
use arrow::compute::cast;
use arrow::datatypes::*;
+use arrow::util::test_util::seedable_rng;
fn build_array<FROM>(size: usize) -> ArrayRef
where
@@ -67,6 +69,48 @@ where
Arc::new(PrimitiveArray::<FROM>::from_opt_vec(values, None))
}
+fn build_utf8_date_array(size: usize, with_nulls: bool) -> ArrayRef {
+ use chrono::NaiveDate;
+
+ // use random numbers to avoid spurious compiler optimizations wrt to branching
+ let mut rng = seedable_rng();
+ let mut builder = StringBuilder::new(size);
+ let range = Uniform::new(0, 737776);
+
+ for _ in 0..size {
+ if with_nulls && rng.gen::<f32>() > 0.8 {
+ builder.append_null().unwrap();
+ } else {
+ let string = NaiveDate::from_num_days_from_ce(rng.sample(range))
+ .format("%Y-%m-%d")
+ .to_string();
+ builder.append_value(&string).unwrap();
+ }
+ }
+ Arc::new(builder.finish())
+}
+
+fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef {
+ use chrono::NaiveDateTime;
+
+ // use random numbers to avoid spurious compiler optimizations wrt to branching
+ let mut rng = seedable_rng();
+ let mut builder = StringBuilder::new(size);
+ let range = Uniform::new(0, 1608071414123);
+
+ for _ in 0..size {
+ if with_nulls && rng.gen::<f32>() > 0.8 {
+ builder.append_null().unwrap();
+ } else {
+ let string = NaiveDateTime::from_timestamp(rng.sample(range), 0)
+ .format("%Y-%m-%dT%H:%M:%S")
+ .to_string();
+ builder.append_value(&string).unwrap();
+ }
+ }
+ Arc::new(builder.finish())
+}
+
// cast array from specified primitive array type to desired data type
fn cast_array(array: &ArrayRef, to_type: DataType) {
criterion::black_box(cast(array, &to_type).unwrap());
@@ -83,6 +127,8 @@ fn add_benchmark(c: &mut Criterion) {
let time64ns_array = build_array::<Time64NanosecondType>(512);
let time_ns_array = build_timestamp_array::<TimestampNanosecondType>(512);
let time_ms_array = build_timestamp_array::<TimestampMillisecondType>(512);
+ let utf8_date_array = build_utf8_date_array(512, true);
+ let utf8_date_time_array = build_utf8_date_time_array(512, true);
c.bench_function("cast int32 to int32 512", |b| {
b.iter(|| cast_array(&i32_array, DataType::Int32))
@@ -145,6 +191,17 @@ fn add_benchmark(c: &mut Criterion) {
c.bench_function("cast timestamp_ms to i64 512", |b| {
b.iter(|| cast_array(&time_ms_array, DataType::Int64))
});
+ c.bench_function("cast utf8 to date32 512", |b| {
+ b.iter(|| cast_array(&utf8_date_array, DataType::Date32(DateUnit::Day)))
+ });
+ c.bench_function("cast utf8 to date64 512", |b| {
+ b.iter(|| {
+ cast_array(
+ &utf8_date_time_array,
+ DataType::Date64(DateUnit::Millisecond),
+ )
+ })
+ });
}
criterion_group!(benches, add_benchmark);
diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs
index 2546990..c2d4aa2 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -380,7 +380,7 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
Float32 => cast_string_to_numeric::<Float32Type>(array),
Float64 => cast_string_to_numeric::<Float64Type>(array),
Date32(DateUnit::Day) => {
- let zero_time = chrono::NaiveTime::from_hms(0, 0, 0);
+ use chrono::Datelike;
let string_array = array.as_any().downcast_ref::<StringArray>().unwrap();
let mut builder = PrimitiveBuilder::<Date32Type>::new(string_array.len());
for i in 0..string_array.len() {
@@ -389,8 +389,7 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
} else {
match string_array.value(i).parse::<chrono::NaiveDate>() {
Ok(date) => builder.append_value(
- (date.and_time(zero_time).timestamp() / SECONDS_IN_DAY)
- as i32,
+ date.num_days_from_ce() - EPOCH_DAYS_FROM_CE,
)?,
Err(_) => builder.append_null()?, // not a valid date
};
@@ -835,6 +834,8 @@ const MICROSECONDS: i64 = 1_000_000;
const NANOSECONDS: i64 = 1_000_000_000;
/// Number of milliseconds in a day
const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS;
+/// Number of days between 0001-01-01 and 1970-01-01
+const EPOCH_DAYS_FROM_CE: i32 = 719_163;
/// Cast an array by changing its array_data type to the desired type
///
@@ -2761,7 +2762,9 @@ mod tests {
#[test]
fn test_cast_utf8_to_date32() {
- use chrono::{NaiveDate, NaiveTime};
+ use chrono::NaiveDate;
+ let from_ymd = chrono::NaiveDate::from_ymd;
+ let since = chrono::NaiveDate::signed_duration_since;
let a = StringArray::from(vec![
"2000-01-01", // valid date with leading 0s
@@ -2774,19 +2777,14 @@ mod tests {
let b = cast(&array, &DataType::Date32(DateUnit::Day)).unwrap();
let c = b.as_any().downcast_ref::<Date32Array>().unwrap();
- let zero_time = NaiveTime::from_hms(0, 0, 0);
// test valid inputs
- let date_value = (NaiveDate::from_ymd(2000, 1, 1)
- .and_time(zero_time)
- .timestamp()
- / SECONDS_IN_DAY) as i32;
+ let date_value = since(NaiveDate::from_ymd(2000, 1, 1), from_ymd(1970, 1, 1))
+ .num_days() as i32;
assert_eq!(true, c.is_valid(0)); // "2000-01-01"
assert_eq!(date_value, c.value(0));
- let date_value = (NaiveDate::from_ymd(2000, 2, 2)
- .and_time(zero_time)
- .timestamp()
- / SECONDS_IN_DAY) as i32;
+ let date_value = since(NaiveDate::from_ymd(2000, 2, 2), from_ymd(1970, 1, 1))
+ .num_days() as i32;
assert_eq!(true, c.is_valid(1)); // "2000-2-2"
assert_eq!(date_value, c.value(1));
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index d91fef4..ac779b0 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -509,15 +509,17 @@ impl Parser for Int16Type {}
impl Parser for Int8Type {}
+/// Number of days between 0001-01-01 and 1970-01-01
+const EPOCH_DAYS_FROM_CE: i32 = 719_163;
+
impl Parser for Date32Type {
fn parse(string: &str) -> Option<i32> {
- let from_ymd = chrono::NaiveDate::from_ymd;
- let since = chrono::NaiveDate::signed_duration_since;
+ use chrono::Datelike;
match Self::DATA_TYPE {
DataType::Date32(DateUnit::Day) => {
- let days = string.parse::<chrono::NaiveDate>().ok()?;
- Self::Native::from_i32(since(days, from_ymd(1970, 1, 1)).num_days() as i32)
+ let date = string.parse::<chrono::NaiveDate>().ok()?;
+ Self::Native::from_i32(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
}
_ => None,
}