You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "vojtechtoman (via GitHub)" <gi...@apache.org> on 2024/03/13 15:29:09 UTC

[PR] Optimize make_date (#9089) [arrow-datafusion]

vojtechtoman opened a new pull request, #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600

   ## Which issue does this PR close?
   
   Closes #9089.
   
   ## What changes are included in this PR?
   
   This PR introduces further optimizations in `make_date`:
   
   - replace the expensive calculation of `unix_days_from_ce` with a constant
   - do not use `PrimitiveArray` builder for the scalar case
   
   ## Are these changes tested?
   
   No new tests needed (no changes to functionality, all covered by existing tests).
   
   Performance is tracked by existing benchmarks for `make_date`. Compared to the previous implementation, this PR shows (on my machine) about 10% improvement for the cases involving arrays and about 20% for the scalars-only case.
   
   ## Are there any user-facing changes?
   
   N/A
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-2001965807

   I took the liberty of merging this PR up from main to resolve conflicts, and implemented the suggestion in https://github.com/apache/arrow-datafusion/pull/9600#discussion_r1525043083 while I had it checked out


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-1995128460

   I think there is already a benchmark:
   
   ```shell
   cargo bench --bench make_date
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "comphead (via GitHub)" <gi...@apache.org>.
comphead commented on code in PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#discussion_r1525044121


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -351,51 +317,77 @@ pub fn make_date(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         Ok(*i)
     };
 
-    // For scalar only columns the operation is faster without using the PrimitiveArray
-    if is_scalar {
-        construct_date_fn(
-            &mut builder,
-            scalar_value_fn(&years)?,
-            scalar_value_fn(&months)?,
-            scalar_value_fn(&days)?,
-            unix_days_from_ce,
-        )?;
-    } else {
-        let to_primitive_array = |col: &ColumnarValue,
-                                  scalar_count: usize|
-         -> Result<PrimitiveArray<Int32Type>> {
+    let value = if let Some(array_size) = len {
+        let to_primitive_array_fn = |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
             match col {
-                ColumnarValue::Array(a) => Ok(a.as_primitive::<Int32Type>().to_owned()),
+                ColumnarValue::Array(a) => a.as_primitive::<Int32Type>().to_owned(),
                 _ => {
                     let v = scalar_value_fn(col).unwrap();
-                    Ok(PrimitiveArray::<Int32Type>::from_value(v, scalar_count))
+                    PrimitiveArray::<Int32Type>::from_value(v, array_size)
                 }
             }
         };
 
-        let years = to_primitive_array(&years, array_size).unwrap();
-        let months = to_primitive_array(&months, array_size).unwrap();
-        let days = to_primitive_array(&days, array_size).unwrap();
+        let years = to_primitive_array_fn(&years);
+        let months = to_primitive_array_fn(&months);
+        let days = to_primitive_array_fn(&days);
+
+        let mut builder: PrimitiveBuilder<Date32Type> =
+            PrimitiveArray::builder(array_size);
         for i in 0..array_size {
-            construct_date_fn(
-                &mut builder,
+            process_date(
                 years.value(i),
                 months.value(i),
                 days.value(i),
-                unix_days_from_ce,
+                |days: i32| builder.append_value(days),
             )?;
         }
-    }
 
-    let arr = builder.finish();
+        let arr = builder.finish();
 
-    if is_scalar {
-        // If all inputs are scalar, keeps output as scalar
-        Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
-            arr.value(0),
-        ))))
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        // For scalar only columns the operation is faster without using the PrimitiveArray.
+        // Also, keep the output as scalar since all inputs are scalar.
+        let mut value = 0;
+        process_date(
+            scalar_value_fn(&years)?,
+            scalar_value_fn(&months)?,
+            scalar_value_fn(&days)?,
+            |days: i32| value = days,
+        )?;
+
+        ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
+    };
+
+    Ok(value)
+}
+
+fn process_date(
+    year: i32,
+    month: i32,
+    day: i32,
+    mut date_consumer_fn: impl FnMut(i32),
+) -> Result<()> {
+    let Ok(m) = u32::try_from(month) else {
+        return exec_err!("Month value '{month:?}' is out of range");
+    };
+    let Ok(d) = u32::try_from(day) else {

Review Comment:
   there is `u32` from `i32`, perhaps its better to align types?  



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-1995129655

   Also I think https://github.com/apache/arrow-datafusion/pull/9601 moves this code and this will cause a conflict


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "comphead (via GitHub)" <gi...@apache.org>.
comphead commented on code in PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#discussion_r1526487199


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -351,51 +317,77 @@ pub fn make_date(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         Ok(*i)
     };
 
-    // For scalar only columns the operation is faster without using the PrimitiveArray
-    if is_scalar {
-        construct_date_fn(
-            &mut builder,
-            scalar_value_fn(&years)?,
-            scalar_value_fn(&months)?,
-            scalar_value_fn(&days)?,
-            unix_days_from_ce,
-        )?;
-    } else {
-        let to_primitive_array = |col: &ColumnarValue,
-                                  scalar_count: usize|
-         -> Result<PrimitiveArray<Int32Type>> {
+    let value = if let Some(array_size) = len {
+        let to_primitive_array_fn = |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
             match col {
-                ColumnarValue::Array(a) => Ok(a.as_primitive::<Int32Type>().to_owned()),
+                ColumnarValue::Array(a) => a.as_primitive::<Int32Type>().to_owned(),
                 _ => {
                     let v = scalar_value_fn(col).unwrap();
-                    Ok(PrimitiveArray::<Int32Type>::from_value(v, scalar_count))
+                    PrimitiveArray::<Int32Type>::from_value(v, array_size)
                 }
             }
         };
 
-        let years = to_primitive_array(&years, array_size).unwrap();
-        let months = to_primitive_array(&months, array_size).unwrap();
-        let days = to_primitive_array(&days, array_size).unwrap();
+        let years = to_primitive_array_fn(&years);
+        let months = to_primitive_array_fn(&months);
+        let days = to_primitive_array_fn(&days);
+
+        let mut builder: PrimitiveBuilder<Date32Type> =
+            PrimitiveArray::builder(array_size);
         for i in 0..array_size {
-            construct_date_fn(
-                &mut builder,
+            process_date(
                 years.value(i),
                 months.value(i),
                 days.value(i),
-                unix_days_from_ce,
+                |days: i32| builder.append_value(days),
             )?;
         }
-    }
 
-    let arr = builder.finish();
+        let arr = builder.finish();
 
-    if is_scalar {
-        // If all inputs are scalar, keeps output as scalar
-        Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
-            arr.value(0),
-        ))))
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        // For scalar only columns the operation is faster without using the PrimitiveArray.
+        // Also, keep the output as scalar since all inputs are scalar.
+        let mut value = 0;
+        process_date(
+            scalar_value_fn(&years)?,
+            scalar_value_fn(&months)?,
+            scalar_value_fn(&days)?,
+            |days: i32| value = days,
+        )?;
+
+        ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
+    };
+
+    Ok(value)
+}
+
+fn process_date(
+    year: i32,
+    month: i32,
+    day: i32,
+    mut date_consumer_fn: impl FnMut(i32),
+) -> Result<()> {
+    let Ok(m) = u32::try_from(month) else {
+        return exec_err!("Month value '{month:?}' is out of range");
+    };
+    let Ok(d) = u32::try_from(day) else {

Review Comment:
   Thanks folks, my point was prob to have the method signature to receive `u32` instead of `i32`, the method works with dates and u32 more natural for the date imho



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-2002399252

   Thanks again @vojtechtoman and @Omega359 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "vojtechtoman (via GitHub)" <gi...@apache.org>.
vojtechtoman commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-1999365452

   > If you are interested @vojtechtoman to_timestamp likely could use optimization as well (#9090)
   
   Sure, let me take a look.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "Omega359 (via GitHub)" <gi...@apache.org>.
Omega359 commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-1998331327

   make_date migration to functions has been merged to main.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "Omega359 (via GitHub)" <gi...@apache.org>.
Omega359 commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-1997660877

   I am seeing these results:
   ```
   ❯ cargo bench --bench make_date
      Compiling datafusion-functions v36.0.0 (/opt/dev/arrow-datafusion/datafusion/functions)
       Finished bench [optimized] target(s) in 1m 08s
        Running benches/make_date.rs (target/release/deps/make_date-f60eb5f26259ede9)
   Gnuplot not found, using plotters backend
   make_date_col_col_col_1000
                           time:   [6.7777 µs 6.8467 µs 6.9200 µs]
                           change: [-25.673% -23.503% -21.299%] (p = 0.00 < 0.05)
                           Performance has improved.
   Found 1 outliers among 100 measurements (1.00%)
     1 (1.00%) high mild
   
   make_date_scalar_col_col_1000
                           time:   [7.1573 µs 7.3016 µs 7.4539 µs]
                           change: [-18.183% -16.227% -14.454%] (p = 0.00 < 0.05)
                           Performance has improved.
   Found 2 outliers among 100 measurements (2.00%)
     1 (1.00%) high mild
     1 (1.00%) high severe
   
   make_date_scalar_scalar_col_1000
                           time:   [7.2130 µs 7.4215 µs 7.6690 µs]
                           change: [-14.990% -12.493% -9.6101%] (p = 0.00 < 0.05)
                           Performance has improved.
   Found 4 outliers among 100 measurements (4.00%)
     3 (3.00%) high mild
     1 (1.00%) high severe
     ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb merged PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "Omega359 (via GitHub)" <gi...@apache.org>.
Omega359 commented on code in PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#discussion_r1526190566


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -351,51 +317,77 @@ pub fn make_date(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         Ok(*i)
     };
 
-    // For scalar only columns the operation is faster without using the PrimitiveArray
-    if is_scalar {
-        construct_date_fn(
-            &mut builder,
-            scalar_value_fn(&years)?,
-            scalar_value_fn(&months)?,
-            scalar_value_fn(&days)?,
-            unix_days_from_ce,
-        )?;
-    } else {
-        let to_primitive_array = |col: &ColumnarValue,
-                                  scalar_count: usize|
-         -> Result<PrimitiveArray<Int32Type>> {
+    let value = if let Some(array_size) = len {
+        let to_primitive_array_fn = |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
             match col {
-                ColumnarValue::Array(a) => Ok(a.as_primitive::<Int32Type>().to_owned()),
+                ColumnarValue::Array(a) => a.as_primitive::<Int32Type>().to_owned(),
                 _ => {
                     let v = scalar_value_fn(col).unwrap();
-                    Ok(PrimitiveArray::<Int32Type>::from_value(v, scalar_count))
+                    PrimitiveArray::<Int32Type>::from_value(v, array_size)
                 }
             }
         };
 
-        let years = to_primitive_array(&years, array_size).unwrap();
-        let months = to_primitive_array(&months, array_size).unwrap();
-        let days = to_primitive_array(&days, array_size).unwrap();
+        let years = to_primitive_array_fn(&years);
+        let months = to_primitive_array_fn(&months);
+        let days = to_primitive_array_fn(&days);
+
+        let mut builder: PrimitiveBuilder<Date32Type> =
+            PrimitiveArray::builder(array_size);
         for i in 0..array_size {
-            construct_date_fn(
-                &mut builder,
+            process_date(
                 years.value(i),
                 months.value(i),
                 days.value(i),
-                unix_days_from_ce,
+                |days: i32| builder.append_value(days),
             )?;
         }
-    }
 
-    let arr = builder.finish();
+        let arr = builder.finish();
 
-    if is_scalar {
-        // If all inputs are scalar, keeps output as scalar
-        Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
-            arr.value(0),
-        ))))
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        // For scalar only columns the operation is faster without using the PrimitiveArray.
+        // Also, keep the output as scalar since all inputs are scalar.
+        let mut value = 0;
+        process_date(
+            scalar_value_fn(&years)?,
+            scalar_value_fn(&months)?,
+            scalar_value_fn(&days)?,
+            |days: i32| value = days,
+        )?;
+
+        ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
+    };
+
+    Ok(value)
+}
+
+fn process_date(
+    year: i32,
+    month: i32,
+    day: i32,
+    mut date_consumer_fn: impl FnMut(i32),
+) -> Result<()> {
+    let Ok(m) = u32::try_from(month) else {
+        return exec_err!("Month value '{month:?}' is out of range");
+    };
+    let Ok(d) = u32::try_from(day) else {

Review Comment:
   u32:: because NaiveDate::from_ymd_opt(year, m, d) requires that for m & d.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "vojtechtoman (via GitHub)" <gi...@apache.org>.
vojtechtoman commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-2003115600

   Thanks everyone!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "Omega359 (via GitHub)" <gi...@apache.org>.
Omega359 commented on PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#issuecomment-1998660847

   If you are interested @vojtechtoman to_timestamp likely could use optimization as well (#9090)


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "comphead (via GitHub)" <gi...@apache.org>.
comphead commented on code in PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#discussion_r1525043083


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -351,51 +317,77 @@ pub fn make_date(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         Ok(*i)
     };
 
-    // For scalar only columns the operation is faster without using the PrimitiveArray
-    if is_scalar {
-        construct_date_fn(
-            &mut builder,
-            scalar_value_fn(&years)?,
-            scalar_value_fn(&months)?,
-            scalar_value_fn(&days)?,
-            unix_days_from_ce,
-        )?;
-    } else {
-        let to_primitive_array = |col: &ColumnarValue,
-                                  scalar_count: usize|
-         -> Result<PrimitiveArray<Int32Type>> {
+    let value = if let Some(array_size) = len {
+        let to_primitive_array_fn = |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
             match col {
-                ColumnarValue::Array(a) => Ok(a.as_primitive::<Int32Type>().to_owned()),
+                ColumnarValue::Array(a) => a.as_primitive::<Int32Type>().to_owned(),
                 _ => {
                     let v = scalar_value_fn(col).unwrap();
-                    Ok(PrimitiveArray::<Int32Type>::from_value(v, scalar_count))
+                    PrimitiveArray::<Int32Type>::from_value(v, array_size)
                 }
             }
         };
 
-        let years = to_primitive_array(&years, array_size).unwrap();
-        let months = to_primitive_array(&months, array_size).unwrap();
-        let days = to_primitive_array(&days, array_size).unwrap();
+        let years = to_primitive_array_fn(&years);
+        let months = to_primitive_array_fn(&months);
+        let days = to_primitive_array_fn(&days);
+
+        let mut builder: PrimitiveBuilder<Date32Type> =
+            PrimitiveArray::builder(array_size);
         for i in 0..array_size {
-            construct_date_fn(
-                &mut builder,
+            process_date(
                 years.value(i),
                 months.value(i),
                 days.value(i),
-                unix_days_from_ce,
+                |days: i32| builder.append_value(days),
             )?;
         }
-    }
 
-    let arr = builder.finish();
+        let arr = builder.finish();
 
-    if is_scalar {
-        // If all inputs are scalar, keeps output as scalar
-        Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
-            arr.value(0),
-        ))))
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        // For scalar only columns the operation is faster without using the PrimitiveArray.
+        // Also, keep the output as scalar since all inputs are scalar.
+        let mut value = 0;
+        process_date(
+            scalar_value_fn(&years)?,
+            scalar_value_fn(&months)?,
+            scalar_value_fn(&days)?,
+            |days: i32| value = days,
+        )?;
+
+        ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
+    };
+
+    Ok(value)
+}
+
+fn process_date(

Review Comment:
   Please have more intuitive name for the function.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on code in PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#discussion_r1526112075


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -351,51 +317,77 @@ pub fn make_date(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         Ok(*i)
     };
 
-    // For scalar only columns the operation is faster without using the PrimitiveArray
-    if is_scalar {
-        construct_date_fn(
-            &mut builder,
-            scalar_value_fn(&years)?,
-            scalar_value_fn(&months)?,
-            scalar_value_fn(&days)?,
-            unix_days_from_ce,
-        )?;
-    } else {
-        let to_primitive_array = |col: &ColumnarValue,
-                                  scalar_count: usize|
-         -> Result<PrimitiveArray<Int32Type>> {
+    let value = if let Some(array_size) = len {
+        let to_primitive_array_fn = |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
             match col {
-                ColumnarValue::Array(a) => Ok(a.as_primitive::<Int32Type>().to_owned()),
+                ColumnarValue::Array(a) => a.as_primitive::<Int32Type>().to_owned(),
                 _ => {
                     let v = scalar_value_fn(col).unwrap();
-                    Ok(PrimitiveArray::<Int32Type>::from_value(v, scalar_count))
+                    PrimitiveArray::<Int32Type>::from_value(v, array_size)
                 }
             }
         };
 
-        let years = to_primitive_array(&years, array_size).unwrap();
-        let months = to_primitive_array(&months, array_size).unwrap();
-        let days = to_primitive_array(&days, array_size).unwrap();
+        let years = to_primitive_array_fn(&years);
+        let months = to_primitive_array_fn(&months);
+        let days = to_primitive_array_fn(&days);
+
+        let mut builder: PrimitiveBuilder<Date32Type> =
+            PrimitiveArray::builder(array_size);
         for i in 0..array_size {
-            construct_date_fn(
-                &mut builder,
+            process_date(
                 years.value(i),
                 months.value(i),
                 days.value(i),
-                unix_days_from_ce,
+                |days: i32| builder.append_value(days),
             )?;
         }
-    }
 
-    let arr = builder.finish();
+        let arr = builder.finish();
 
-    if is_scalar {
-        // If all inputs are scalar, keeps output as scalar
-        Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
-            arr.value(0),
-        ))))
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        // For scalar only columns the operation is faster without using the PrimitiveArray.
+        // Also, keep the output as scalar since all inputs are scalar.
+        let mut value = 0;
+        process_date(
+            scalar_value_fn(&years)?,
+            scalar_value_fn(&months)?,
+            scalar_value_fn(&days)?,
+            |days: i32| value = days,
+        )?;
+
+        ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
+    };
+
+    Ok(value)
+}
+
+fn process_date(

Review Comment:
   Perhaps something like `make_date_inner`
   
   ```suggestion
   /// Converts the year/month/day fields to a `i32` 
   /// representing the days from the unix epoch
   /// and invokes date_consumer_fn with the value
   fn make_date_inner(
   ```



##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -351,51 +317,77 @@ pub fn make_date(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         Ok(*i)
     };
 
-    // For scalar only columns the operation is faster without using the PrimitiveArray
-    if is_scalar {
-        construct_date_fn(
-            &mut builder,
-            scalar_value_fn(&years)?,
-            scalar_value_fn(&months)?,
-            scalar_value_fn(&days)?,
-            unix_days_from_ce,
-        )?;
-    } else {
-        let to_primitive_array = |col: &ColumnarValue,
-                                  scalar_count: usize|
-         -> Result<PrimitiveArray<Int32Type>> {
+    let value = if let Some(array_size) = len {
+        let to_primitive_array_fn = |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
             match col {
-                ColumnarValue::Array(a) => Ok(a.as_primitive::<Int32Type>().to_owned()),
+                ColumnarValue::Array(a) => a.as_primitive::<Int32Type>().to_owned(),
                 _ => {
                     let v = scalar_value_fn(col).unwrap();
-                    Ok(PrimitiveArray::<Int32Type>::from_value(v, scalar_count))
+                    PrimitiveArray::<Int32Type>::from_value(v, array_size)
                 }
             }
         };
 
-        let years = to_primitive_array(&years, array_size).unwrap();
-        let months = to_primitive_array(&months, array_size).unwrap();
-        let days = to_primitive_array(&days, array_size).unwrap();
+        let years = to_primitive_array_fn(&years);
+        let months = to_primitive_array_fn(&months);
+        let days = to_primitive_array_fn(&days);
+
+        let mut builder: PrimitiveBuilder<Date32Type> =
+            PrimitiveArray::builder(array_size);
         for i in 0..array_size {
-            construct_date_fn(
-                &mut builder,
+            process_date(
                 years.value(i),
                 months.value(i),
                 days.value(i),
-                unix_days_from_ce,
+                |days: i32| builder.append_value(days),
             )?;
         }
-    }
 
-    let arr = builder.finish();
+        let arr = builder.finish();
 
-    if is_scalar {
-        // If all inputs are scalar, keeps output as scalar
-        Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
-            arr.value(0),
-        ))))
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        // For scalar only columns the operation is faster without using the PrimitiveArray.
+        // Also, keep the output as scalar since all inputs are scalar.
+        let mut value = 0;
+        process_date(
+            scalar_value_fn(&years)?,
+            scalar_value_fn(&months)?,
+            scalar_value_fn(&days)?,
+            |days: i32| value = days,
+        )?;
+
+        ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
+    };
+
+    Ok(value)
+}
+
+fn process_date(
+    year: i32,
+    month: i32,
+    day: i32,
+    mut date_consumer_fn: impl FnMut(i32),
+) -> Result<()> {
+    let Ok(m) = u32::try_from(month) else {
+        return exec_err!("Month value '{month:?}' is out of range");
+    };
+    let Ok(d) = u32::try_from(day) else {

Review Comment:
   I am not quite sure what you are suggesting here @comphead  I think this code is moved from above 
   
   The fact that arrow uses `i32` for the subfields rather than `u32` is somewhat non ideal I agree, but this code I believe just follows the standard



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


Re: [PR] Optimize make_date (#9089) [arrow-datafusion]

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on code in PR #9600:
URL: https://github.com/apache/arrow-datafusion/pull/9600#discussion_r1526812147


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -351,51 +317,77 @@ pub fn make_date(args: &[ColumnarValue]) -> Result<ColumnarValue> {
         Ok(*i)
     };
 
-    // For scalar only columns the operation is faster without using the PrimitiveArray
-    if is_scalar {
-        construct_date_fn(
-            &mut builder,
-            scalar_value_fn(&years)?,
-            scalar_value_fn(&months)?,
-            scalar_value_fn(&days)?,
-            unix_days_from_ce,
-        )?;
-    } else {
-        let to_primitive_array = |col: &ColumnarValue,
-                                  scalar_count: usize|
-         -> Result<PrimitiveArray<Int32Type>> {
+    let value = if let Some(array_size) = len {
+        let to_primitive_array_fn = |col: &ColumnarValue| -> PrimitiveArray<Int32Type> {
             match col {
-                ColumnarValue::Array(a) => Ok(a.as_primitive::<Int32Type>().to_owned()),
+                ColumnarValue::Array(a) => a.as_primitive::<Int32Type>().to_owned(),
                 _ => {
                     let v = scalar_value_fn(col).unwrap();
-                    Ok(PrimitiveArray::<Int32Type>::from_value(v, scalar_count))
+                    PrimitiveArray::<Int32Type>::from_value(v, array_size)
                 }
             }
         };
 
-        let years = to_primitive_array(&years, array_size).unwrap();
-        let months = to_primitive_array(&months, array_size).unwrap();
-        let days = to_primitive_array(&days, array_size).unwrap();
+        let years = to_primitive_array_fn(&years);
+        let months = to_primitive_array_fn(&months);
+        let days = to_primitive_array_fn(&days);
+
+        let mut builder: PrimitiveBuilder<Date32Type> =
+            PrimitiveArray::builder(array_size);
         for i in 0..array_size {
-            construct_date_fn(
-                &mut builder,
+            process_date(
                 years.value(i),
                 months.value(i),
                 days.value(i),
-                unix_days_from_ce,
+                |days: i32| builder.append_value(days),
             )?;
         }
-    }
 
-    let arr = builder.finish();
+        let arr = builder.finish();
 
-    if is_scalar {
-        // If all inputs are scalar, keeps output as scalar
-        Ok(ColumnarValue::Scalar(ScalarValue::Date32(Some(
-            arr.value(0),
-        ))))
+        ColumnarValue::Array(Arc::new(arr))
+    } else {
+        // For scalar only columns the operation is faster without using the PrimitiveArray.
+        // Also, keep the output as scalar since all inputs are scalar.
+        let mut value = 0;
+        process_date(
+            scalar_value_fn(&years)?,
+            scalar_value_fn(&months)?,
+            scalar_value_fn(&days)?,
+            |days: i32| value = days,
+        )?;
+
+        ColumnarValue::Scalar(ScalarValue::Date32(Some(value)))
+    };
+
+    Ok(value)
+}
+
+fn process_date(
+    year: i32,
+    month: i32,
+    day: i32,
+    mut date_consumer_fn: impl FnMut(i32),
+) -> Result<()> {
+    let Ok(m) = u32::try_from(month) else {
+        return exec_err!("Month value '{month:?}' is out of range");
+    };
+    let Ok(d) = u32::try_from(day) else {

Review Comment:
    I agree `u32` would be more natural. I think the `i32` is used because that is what the underlying APIs require



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org