You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/11/15 11:18:42 UTC

[arrow-datafusion] branch master updated: Parse nanoseconds for intervals (#4186)

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/master by this push:
     new 8bfc0ea2e Parse nanoseconds for intervals (#4186)
8bfc0ea2e is described below

commit 8bfc0ea2e2b5040d0386b1d11c6817b0a8d20661
Author: Jeffrey <22...@users.noreply.github.com>
AuthorDate: Tue Nov 15 22:18:35 2022 +1100

    Parse nanoseconds for intervals (#4186)
    
    * Parse nanoseconds for intervals
    
    * Formatting
    
    Co-authored-by: Ruihang Xia <wa...@gmail.com>
    
    * Empty commit
    
    Co-authored-by: Ruihang Xia <wa...@gmail.com>
---
 datafusion/common/src/parsers.rs  | 143 +++++++++++++++++++++-----------------
 datafusion/core/tests/sql/expr.rs |   6 +-
 2 files changed, 81 insertions(+), 68 deletions(-)

diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs
index 7b78c92bc..94ad5d86b 100644
--- a/datafusion/common/src/parsers.rs
+++ b/datafusion/common/src/parsers.rs
@@ -19,8 +19,8 @@
 use crate::{DataFusionError, Result, ScalarValue};
 use std::str::FromStr;
 
-const SECONDS_PER_HOUR: f32 = 3_600_f32;
-const MILLIS_PER_SECOND: f32 = 1_000_f32;
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_SECOND: f64 = 1_000_000_000_f64;
 
 /// Parses a string with an interval like `'0.5 MONTH'` to an
 /// appropriately typed [`ScalarValue`]
@@ -29,73 +29,74 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
     // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
     // INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
     let align_interval_parts =
-        |month_part: f32, mut day_part: f32, mut milles_part: f32| -> (i32, i32, f32) {
+        |month_part: f64, mut day_part: f64, mut nanos_part: f64| -> (i64, i64, f64) {
             // Convert fractional month to days, It's not supported by Arrow types, but anyway
-            day_part += (month_part - (month_part as i32) as f32) * 30_f32;
+            day_part += (month_part - (month_part as i64) as f64) * 30_f64;
 
             // Convert fractional days to hours
-            milles_part += (day_part - ((day_part as i32) as f32))
-                * 24_f32
+            nanos_part += (day_part - ((day_part as i64) as f64))
+                * 24_f64
                 * SECONDS_PER_HOUR
-                * MILLIS_PER_SECOND;
+                * NANOS_PER_SECOND;
 
-            (month_part as i32, day_part as i32, milles_part)
+            (month_part as i64, day_part as i64, nanos_part)
         };
 
-    let calculate_from_part =
-        |interval_period_str: &str, interval_type: &str| -> Result<(i32, i32, f32)> {
-            // @todo It's better to use Decimal in order to protect rounding errors
-            // Wait https://github.com/apache/arrow/pull/9232
-            let interval_period = match f32::from_str(interval_period_str) {
-                Ok(n) => n,
-                Err(_) => {
-                    return Err(DataFusionError::NotImplemented(format!(
-                        "Unsupported Interval Expression with value {:?}",
-                        value
-                    )));
-                }
-            };
-
-            if interval_period > (i32::MAX as f32) {
+    let calculate_from_part = |interval_period_str: &str,
+                               interval_type: &str|
+     -> Result<(i64, i64, f64)> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
                 return Err(DataFusionError::NotImplemented(format!(
-                    "Interval field value out of range: {:?}",
+                    "Unsupported Interval Expression with value {:?}",
                     value
                 )));
             }
+        };
 
-            match interval_type.to_lowercase().as_str() {
-                "century" | "centuries" => {
-                    Ok(align_interval_parts(interval_period * 1200_f32, 0.0, 0.0))
-                }
-                "decade" | "decades" => {
-                    Ok(align_interval_parts(interval_period * 120_f32, 0.0, 0.0))
-                }
-                "year" | "years" => {
-                    Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0))
-                }
-                "month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
-                "week" | "weeks" => {
-                    Ok(align_interval_parts(0.0, interval_period * 7_f32, 0.0))
-                }
-                "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)),
-                "hour" | "hours" => {
-                    Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND))
-                }
-                "minute" | "minutes" => {
-                    Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND))
-                }
-                "second" | "seconds" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)),
-                "millisecond" | "milliseconds" => Ok((0, 0, interval_period)),
-                _ => Err(DataFusionError::NotImplemented(format!(
-                    "Invalid input syntax for type interval: {:?}",
-                    value
-                ))),
+        if interval_period > (i64::MAX as f64) {
+            return Err(DataFusionError::NotImplemented(format!(
+                "Interval field value out of range: {:?}",
+                value
+            )));
+        }
+
+        match interval_type.to_lowercase().as_str() {
+            "century" | "centuries" => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
             }
-        };
+            "decade" | "decades" => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            "year" | "years" => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            "month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            "week" | "weeks" => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            "hour" | "hours" => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            "minute" | "minutes" => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            "second" | "seconds" => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            "millisecond" | "milliseconds" => Ok((0, 0, interval_period * 1_000_000f64)),
+            _ => Err(DataFusionError::NotImplemented(format!(
+                "Invalid input syntax for type interval: {:?}",
+                value
+            ))),
+        }
+    };
 
     let mut result_month: i64 = 0;
     let mut result_days: i64 = 0;
-    let mut result_millis: i64 = 0;
+    let mut result_nanos: i128 = 0;
 
     let mut parts = value.split_whitespace();
 
@@ -107,7 +108,7 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
 
         let unit = parts.next().unwrap_or(leading_field);
 
-        let (diff_month, diff_days, diff_millis) =
+        let (diff_month, diff_days, diff_nanos) =
             calculate_from_part(interval_period_str.unwrap(), unit)?;
 
         result_month += diff_month as i64;
@@ -128,9 +129,9 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
             )));
         }
 
-        result_millis += diff_millis as i64;
+        result_nanos += diff_nanos as i128;
 
-        if result_millis > (i32::MAX as i64) {
+        if result_nanos > (i64::MAX as i128) {
             return Err(DataFusionError::NotImplemented(format!(
                 "Interval field value out of range: {:?}",
                 value
@@ -142,11 +143,13 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
     // 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month
     // The true way to store and calculate intervals is to store it as it defined
     // It's why we there are 3 different interval types in Arrow
-    if result_month != 0 && (result_days != 0 || result_millis != 0) {
-        let result: i128 = ((result_month as i128) << 96)
-            | ((result_days as i128) << 64)
-            // IntervalMonthDayNano uses nanos, but IntervalDayTime uses milles
-            | ((result_millis * 1_000_000_i64) as i128);
+
+    // If have a unit smaller than milliseconds then must use IntervalMonthDayNano
+    if (result_nanos % 1_000_000 != 0)
+        || (result_month != 0 && (result_days != 0 || result_nanos != 0))
+    {
+        let result: i128 =
+            ((result_month as i128) << 96) | ((result_days as i128) << 64) | result_nanos;
 
         return Ok(ScalarValue::IntervalMonthDayNano(Some(result)));
     }
@@ -156,7 +159,8 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
         return Ok(ScalarValue::IntervalYearMonth(Some(result_month as i32)));
     }
 
-    let result: i64 = (result_days << 32) | result_millis;
+    // IntervalMonthDayNano uses nanos, but IntervalDayTime uses millis
+    let result: i64 = (result_days << 32) | (result_nanos as i64 / 1_000_000);
     Ok(ScalarValue::IntervalDayTime(Some(result)))
 }
 
@@ -165,6 +169,8 @@ mod test {
     use super::*;
     use crate::assert_contains;
 
+    const MILLIS_PER_SECOND: f64 = 1_000_f64;
+
     #[test]
     fn test_parse_ym() {
         assert_eq!(
@@ -212,13 +218,20 @@ mod test {
     }
 
     #[test]
-    // https://github.com/apache/arrow-rs/pull/2235
-    #[ignore]
     fn test_mdn() {
-        // these should be the same, I think
         assert_eq!(
             parse_interval("months", "1 year 25 millisecond").unwrap(),
             ScalarValue::new_interval_mdn(12, 0, 25 * 1_000_000)
         );
+
+        assert_eq!(
+            parse_interval("months", "1 year 1 day 0.000000001 seconds").unwrap(),
+            ScalarValue::new_interval_mdn(12, 1, 1)
+        );
+
+        assert_eq!(
+            parse_interval("months", "1 year 1 day 0.1 milliseconds").unwrap(),
+            ScalarValue::new_interval_mdn(12, 1, 1_00 * 1_000)
+        );
     }
 }
diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs
index 8251a5546..f82be2a5b 100644
--- a/datafusion/core/tests/sql/expr.rs
+++ b/datafusion/core/tests/sql/expr.rs
@@ -917,11 +917,11 @@ async fn test_interval_expressions() -> Result<()> {
     );
     test_expression!(
         "interval '0.499 day'",
-        "0 years 0 mons 0 days 11 hours 58 mins 33.596 secs"
+        "0 years 0 mons 0 days 11 hours 58 mins 33.600 secs"
     );
     test_expression!(
         "interval '0.4999 day'",
-        "0 years 0 mons 0 days 11 hours 59 mins 51.364 secs"
+        "0 years 0 mons 0 days 11 hours 59 mins 51.360 secs"
     );
     test_expression!(
         "interval '0.49999 day'",
@@ -929,7 +929,7 @@ async fn test_interval_expressions() -> Result<()> {
     );
     test_expression!(
         "interval '0.49999999999 day'",
-        "0 years 0 mons 0 days 12 hours 0 mins 0.00 secs"
+        "0 years 0 mons 0 days 11 hours 59 mins 59.999999136 secs"
     );
     test_expression!(
         "interval '5 day'",