You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by jo...@apache.org on 2020/12/21 06:52:30 UTC

[arrow] branch master updated: ARROW-10947: [Rust][DataFusion] Optimize UTF8 to Date32 Conversion

This is an automated email from the ASF dual-hosted git repository.

jorgecarleitao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 5f73bd5  ARROW-10947: [Rust][DataFusion] Optimize UTF8 to Date32 Conversion
5f73bd5 is described below

commit 5f73bd5bdd0742195d89375848faa2cb88109881
Author: Mike Seddon <se...@gmail.com>
AuthorDate: Mon Dec 21 06:51:00 2020 +0000

    ARROW-10947: [Rust][DataFusion] Optimize UTF8 to Date32 Conversion
    
    After adding benchmarking capability to the UTF8 to Date32/Date64 CAST functions, there was an opportunity to improve the performance.
    
    This PR uses built-in `chrono` functionality to calculate the number of days since CE, then subtracts a constant to obtain the day offset relative to 1970-01-01. This improves performance by around 10% for this operation relative to the `since` function, presumably because `chrono` does not have to ensure the `from_ymd` is a valid date.
    
    Before:
    ```
    cast utf8 to date32 512 time:   [41.966 us 42.508 us 43.087 us]
    cast utf8 to date32 512 time:   [40.591 us 40.661 us 40.740 us]
    cast utf8 to date32 512 time:   [40.825 us 40.878 us 40.916 us]
    ```
    
    After:
    ```
    cast utf8 to date32 512 time:   [36.557 us 36.839 us 37.200 us]
    cast utf8 to date32 512 time:   [35.997 us 36.442 us 36.919 us]
    cast utf8 to date32 512 time:   [35.750 us 35.969 us 36.160 us]
    ```
    
    Closes #8943 from seddonm1/utf8-date32-optimize
    
    Authored-by: Mike Seddon <se...@gmail.com>
    Signed-off-by: Jorge C. Leitao <jo...@gmail.com>
---
 rust/arrow/benches/cast_kernels.rs     | 59 +++++++++++++++++++++++++++++++++-
 rust/arrow/src/compute/kernels/cast.rs | 24 +++++++-------
 rust/arrow/src/csv/reader.rs           | 10 +++---
 3 files changed, 75 insertions(+), 18 deletions(-)

diff --git a/rust/arrow/benches/cast_kernels.rs b/rust/arrow/benches/cast_kernels.rs
index dbad552..7f6acd2 100644
--- a/rust/arrow/benches/cast_kernels.rs
+++ b/rust/arrow/benches/cast_kernels.rs
@@ -18,8 +18,9 @@
 #[macro_use]
 extern crate criterion;
 use criterion::Criterion;
-use rand::distributions::{Distribution, Standard};
+use rand::distributions::{Distribution, Standard, Uniform};
 use rand::prelude::random;
+use rand::Rng;
 
 use std::sync::Arc;
 
@@ -28,6 +29,7 @@ extern crate arrow;
 use arrow::array::*;
 use arrow::compute::cast;
 use arrow::datatypes::*;
+use arrow::util::test_util::seedable_rng;
 
 fn build_array<FROM>(size: usize) -> ArrayRef
 where
@@ -67,6 +69,48 @@ where
     Arc::new(PrimitiveArray::<FROM>::from_opt_vec(values, None))
 }
 
+fn build_utf8_date_array(size: usize, with_nulls: bool) -> ArrayRef {
+    use chrono::NaiveDate;
+
+    // use random numbers to avoid spurious compiler optimizations wrt to branching
+    let mut rng = seedable_rng();
+    let mut builder = StringBuilder::new(size);
+    let range = Uniform::new(0, 737776);
+
+    for _ in 0..size {
+        if with_nulls && rng.gen::<f32>() > 0.8 {
+            builder.append_null().unwrap();
+        } else {
+            let string = NaiveDate::from_num_days_from_ce(rng.sample(range))
+                .format("%Y-%m-%d")
+                .to_string();
+            builder.append_value(&string).unwrap();
+        }
+    }
+    Arc::new(builder.finish())
+}
+
+fn build_utf8_date_time_array(size: usize, with_nulls: bool) -> ArrayRef {
+    use chrono::NaiveDateTime;
+
+    // use random numbers to avoid spurious compiler optimizations wrt to branching
+    let mut rng = seedable_rng();
+    let mut builder = StringBuilder::new(size);
+    let range = Uniform::new(0, 1608071414123);
+
+    for _ in 0..size {
+        if with_nulls && rng.gen::<f32>() > 0.8 {
+            builder.append_null().unwrap();
+        } else {
+            let string = NaiveDateTime::from_timestamp(rng.sample(range), 0)
+                .format("%Y-%m-%dT%H:%M:%S")
+                .to_string();
+            builder.append_value(&string).unwrap();
+        }
+    }
+    Arc::new(builder.finish())
+}
+
 // cast array from specified primitive array type to desired data type
 fn cast_array(array: &ArrayRef, to_type: DataType) {
     criterion::black_box(cast(array, &to_type).unwrap());
@@ -83,6 +127,8 @@ fn add_benchmark(c: &mut Criterion) {
     let time64ns_array = build_array::<Time64NanosecondType>(512);
     let time_ns_array = build_timestamp_array::<TimestampNanosecondType>(512);
     let time_ms_array = build_timestamp_array::<TimestampMillisecondType>(512);
+    let utf8_date_array = build_utf8_date_array(512, true);
+    let utf8_date_time_array = build_utf8_date_time_array(512, true);
 
     c.bench_function("cast int32 to int32 512", |b| {
         b.iter(|| cast_array(&i32_array, DataType::Int32))
@@ -145,6 +191,17 @@ fn add_benchmark(c: &mut Criterion) {
     c.bench_function("cast timestamp_ms to i64 512", |b| {
         b.iter(|| cast_array(&time_ms_array, DataType::Int64))
     });
+    c.bench_function("cast utf8 to date32 512", |b| {
+        b.iter(|| cast_array(&utf8_date_array, DataType::Date32(DateUnit::Day)))
+    });
+    c.bench_function("cast utf8 to date64 512", |b| {
+        b.iter(|| {
+            cast_array(
+                &utf8_date_time_array,
+                DataType::Date64(DateUnit::Millisecond),
+            )
+        })
+    });
 }
 
 criterion_group!(benches, add_benchmark);
diff --git a/rust/arrow/src/compute/kernels/cast.rs b/rust/arrow/src/compute/kernels/cast.rs
index 2546990..c2d4aa2 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -380,7 +380,7 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
             Float32 => cast_string_to_numeric::<Float32Type>(array),
             Float64 => cast_string_to_numeric::<Float64Type>(array),
             Date32(DateUnit::Day) => {
-                let zero_time = chrono::NaiveTime::from_hms(0, 0, 0);
+                use chrono::Datelike;
                 let string_array = array.as_any().downcast_ref::<StringArray>().unwrap();
                 let mut builder = PrimitiveBuilder::<Date32Type>::new(string_array.len());
                 for i in 0..string_array.len() {
@@ -389,8 +389,7 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> Result<ArrayRef> {
                     } else {
                         match string_array.value(i).parse::<chrono::NaiveDate>() {
                             Ok(date) => builder.append_value(
-                                (date.and_time(zero_time).timestamp() / SECONDS_IN_DAY)
-                                    as i32,
+                                date.num_days_from_ce() - EPOCH_DAYS_FROM_CE,
                             )?,
                             Err(_) => builder.append_null()?, // not a valid date
                         };
@@ -835,6 +834,8 @@ const MICROSECONDS: i64 = 1_000_000;
 const NANOSECONDS: i64 = 1_000_000_000;
 /// Number of milliseconds in a day
 const MILLISECONDS_IN_DAY: i64 = SECONDS_IN_DAY * MILLISECONDS;
+/// Value of `num_days_from_ce()` for 1970-01-01 (chrono counts 0001-01-01 as day 1)
+const EPOCH_DAYS_FROM_CE: i32 = 719_163;
 
 /// Cast an array by changing its array_data type to the desired type
 ///
@@ -2761,7 +2762,9 @@ mod tests {
 
     #[test]
     fn test_cast_utf8_to_date32() {
-        use chrono::{NaiveDate, NaiveTime};
+        use chrono::NaiveDate;
+        let from_ymd = chrono::NaiveDate::from_ymd;
+        let since = chrono::NaiveDate::signed_duration_since;
 
         let a = StringArray::from(vec![
             "2000-01-01",          // valid date with leading 0s
@@ -2774,19 +2777,14 @@ mod tests {
         let b = cast(&array, &DataType::Date32(DateUnit::Day)).unwrap();
         let c = b.as_any().downcast_ref::<Date32Array>().unwrap();
 
-        let zero_time = NaiveTime::from_hms(0, 0, 0);
         // test valid inputs
-        let date_value = (NaiveDate::from_ymd(2000, 1, 1)
-            .and_time(zero_time)
-            .timestamp()
-            / SECONDS_IN_DAY) as i32;
+        let date_value = since(NaiveDate::from_ymd(2000, 1, 1), from_ymd(1970, 1, 1))
+            .num_days() as i32;
         assert_eq!(true, c.is_valid(0)); // "2000-01-01"
         assert_eq!(date_value, c.value(0));
 
-        let date_value = (NaiveDate::from_ymd(2000, 2, 2)
-            .and_time(zero_time)
-            .timestamp()
-            / SECONDS_IN_DAY) as i32;
+        let date_value = since(NaiveDate::from_ymd(2000, 2, 2), from_ymd(1970, 1, 1))
+            .num_days() as i32;
         assert_eq!(true, c.is_valid(1)); // "2000-2-2"
         assert_eq!(date_value, c.value(1));
 
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index d91fef4..ac779b0 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -509,15 +509,17 @@ impl Parser for Int16Type {}
 
 impl Parser for Int8Type {}
 
+/// Value of `num_days_from_ce()` for 1970-01-01 (chrono counts 0001-01-01 as day 1)
+const EPOCH_DAYS_FROM_CE: i32 = 719_163;
+
 impl Parser for Date32Type {
     fn parse(string: &str) -> Option<i32> {
-        let from_ymd = chrono::NaiveDate::from_ymd;
-        let since = chrono::NaiveDate::signed_duration_since;
+        use chrono::Datelike;
 
         match Self::DATA_TYPE {
             DataType::Date32(DateUnit::Day) => {
-                let days = string.parse::<chrono::NaiveDate>().ok()?;
-                Self::Native::from_i32(since(days, from_ymd(1970, 1, 1)).num_days() as i32)
+                let date = string.parse::<chrono::NaiveDate>().ok()?;
+                Self::Native::from_i32(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
             }
             _ => None,
         }