You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2022/11/15 11:18:42 UTC
[arrow-datafusion] branch master updated: Parse nanoseconds for intervals (#4186)
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 8bfc0ea2e Parse nanoseconds for intervals (#4186)
8bfc0ea2e is described below
commit 8bfc0ea2e2b5040d0386b1d11c6817b0a8d20661
Author: Jeffrey <22...@users.noreply.github.com>
AuthorDate: Tue Nov 15 22:18:35 2022 +1100
Parse nanoseconds for intervals (#4186)
* Parse nanoseconds for intervals
* Formatting
Co-authored-by: Ruihang Xia <wa...@gmail.com>
* Empty commit
Co-authored-by: Ruihang Xia <wa...@gmail.com>
---
datafusion/common/src/parsers.rs | 143 +++++++++++++++++++++-----------------
datafusion/core/tests/sql/expr.rs | 6 +-
2 files changed, 81 insertions(+), 68 deletions(-)
diff --git a/datafusion/common/src/parsers.rs b/datafusion/common/src/parsers.rs
index 7b78c92bc..94ad5d86b 100644
--- a/datafusion/common/src/parsers.rs
+++ b/datafusion/common/src/parsers.rs
@@ -19,8 +19,8 @@
use crate::{DataFusionError, Result, ScalarValue};
use std::str::FromStr;
-const SECONDS_PER_HOUR: f32 = 3_600_f32;
-const MILLIS_PER_SECOND: f32 = 1_000_f32;
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_SECOND: f64 = 1_000_000_000_f64;
/// Parses a string with an interval like `'0.5 MONTH'` to an
/// appropriately typed [`ScalarValue`]
@@ -29,73 +29,74 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
let align_interval_parts =
- |month_part: f32, mut day_part: f32, mut milles_part: f32| -> (i32, i32, f32) {
+ |month_part: f64, mut day_part: f64, mut nanos_part: f64| -> (i64, i64, f64) {
// Convert fractional month to days, It's not supported by Arrow types, but anyway
- day_part += (month_part - (month_part as i32) as f32) * 30_f32;
+ day_part += (month_part - (month_part as i64) as f64) * 30_f64;
// Convert fractional days to hours
- milles_part += (day_part - ((day_part as i32) as f32))
- * 24_f32
+ nanos_part += (day_part - ((day_part as i64) as f64))
+ * 24_f64
* SECONDS_PER_HOUR
- * MILLIS_PER_SECOND;
+ * NANOS_PER_SECOND;
- (month_part as i32, day_part as i32, milles_part)
+ (month_part as i64, day_part as i64, nanos_part)
};
- let calculate_from_part =
- |interval_period_str: &str, interval_type: &str| -> Result<(i32, i32, f32)> {
- // @todo It's better to use Decimal in order to protect rounding errors
- // Wait https://github.com/apache/arrow/pull/9232
- let interval_period = match f32::from_str(interval_period_str) {
- Ok(n) => n,
- Err(_) => {
- return Err(DataFusionError::NotImplemented(format!(
- "Unsupported Interval Expression with value {:?}",
- value
- )));
- }
- };
-
- if interval_period > (i32::MAX as f32) {
+ let calculate_from_part = |interval_period_str: &str,
+ interval_type: &str|
+ -> Result<(i64, i64, f64)> {
+ // @todo It's better to use Decimal in order to protect rounding errors
+ // Wait https://github.com/apache/arrow/pull/9232
+ let interval_period = match f64::from_str(interval_period_str) {
+ Ok(n) => n,
+ Err(_) => {
return Err(DataFusionError::NotImplemented(format!(
- "Interval field value out of range: {:?}",
+ "Unsupported Interval Expression with value {:?}",
value
)));
}
+ };
- match interval_type.to_lowercase().as_str() {
- "century" | "centuries" => {
- Ok(align_interval_parts(interval_period * 1200_f32, 0.0, 0.0))
- }
- "decade" | "decades" => {
- Ok(align_interval_parts(interval_period * 120_f32, 0.0, 0.0))
- }
- "year" | "years" => {
- Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0))
- }
- "month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
- "week" | "weeks" => {
- Ok(align_interval_parts(0.0, interval_period * 7_f32, 0.0))
- }
- "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)),
- "hour" | "hours" => {
- Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND))
- }
- "minute" | "minutes" => {
- Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND))
- }
- "second" | "seconds" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)),
- "millisecond" | "milliseconds" => Ok((0, 0, interval_period)),
- _ => Err(DataFusionError::NotImplemented(format!(
- "Invalid input syntax for type interval: {:?}",
- value
- ))),
+ if interval_period > (i64::MAX as f64) {
+ return Err(DataFusionError::NotImplemented(format!(
+ "Interval field value out of range: {:?}",
+ value
+ )));
+ }
+
+ match interval_type.to_lowercase().as_str() {
+ "century" | "centuries" => {
+ Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
}
- };
+ "decade" | "decades" => {
+ Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+ }
+ "year" | "years" => {
+ Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+ }
+ "month" | "months" => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+ "week" | "weeks" => {
+ Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+ }
+ "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+ "hour" | "hours" => {
+ Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+ }
+ "minute" | "minutes" => {
+ Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+ }
+ "second" | "seconds" => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+ "millisecond" | "milliseconds" => Ok((0, 0, interval_period * 1_000_000f64)),
+ _ => Err(DataFusionError::NotImplemented(format!(
+ "Invalid input syntax for type interval: {:?}",
+ value
+ ))),
+ }
+ };
let mut result_month: i64 = 0;
let mut result_days: i64 = 0;
- let mut result_millis: i64 = 0;
+ let mut result_nanos: i128 = 0;
let mut parts = value.split_whitespace();
@@ -107,7 +108,7 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
let unit = parts.next().unwrap_or(leading_field);
- let (diff_month, diff_days, diff_millis) =
+ let (diff_month, diff_days, diff_nanos) =
calculate_from_part(interval_period_str.unwrap(), unit)?;
result_month += diff_month as i64;
@@ -128,9 +129,9 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
)));
}
- result_millis += diff_millis as i64;
+ result_nanos += diff_nanos as i128;
- if result_millis > (i32::MAX as i64) {
+ if result_nanos > (i64::MAX as i128) {
return Err(DataFusionError::NotImplemented(format!(
"Interval field value out of range: {:?}",
value
@@ -142,11 +143,13 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
// 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month
// The true way to store and calculate intervals is to store it as it defined
// It's why we there are 3 different interval types in Arrow
- if result_month != 0 && (result_days != 0 || result_millis != 0) {
- let result: i128 = ((result_month as i128) << 96)
- | ((result_days as i128) << 64)
- // IntervalMonthDayNano uses nanos, but IntervalDayTime uses milles
- | ((result_millis * 1_000_000_i64) as i128);
+
+ // If have a unit smaller than milliseconds then must use IntervalMonthDayNano
+ if (result_nanos % 1_000_000 != 0)
+ || (result_month != 0 && (result_days != 0 || result_nanos != 0))
+ {
+ let result: i128 =
+ ((result_month as i128) << 96) | ((result_days as i128) << 64) | result_nanos;
return Ok(ScalarValue::IntervalMonthDayNano(Some(result)));
}
@@ -156,7 +159,8 @@ pub fn parse_interval(leading_field: &str, value: &str) -> Result<ScalarValue> {
return Ok(ScalarValue::IntervalYearMonth(Some(result_month as i32)));
}
- let result: i64 = (result_days << 32) | result_millis;
+ // IntervalMonthDayNano uses nanos, but IntervalDayTime uses millis
+ let result: i64 = (result_days << 32) | (result_nanos as i64 / 1_000_000);
Ok(ScalarValue::IntervalDayTime(Some(result)))
}
@@ -165,6 +169,8 @@ mod test {
use super::*;
use crate::assert_contains;
+ const MILLIS_PER_SECOND: f64 = 1_000_f64;
+
#[test]
fn test_parse_ym() {
assert_eq!(
@@ -212,13 +218,20 @@ mod test {
}
#[test]
- // https://github.com/apache/arrow-rs/pull/2235
- #[ignore]
fn test_mdn() {
- // these should be the same, I think
assert_eq!(
parse_interval("months", "1 year 25 millisecond").unwrap(),
ScalarValue::new_interval_mdn(12, 0, 25 * 1_000_000)
);
+
+ assert_eq!(
+ parse_interval("months", "1 year 1 day 0.000000001 seconds").unwrap(),
+ ScalarValue::new_interval_mdn(12, 1, 1)
+ );
+
+ assert_eq!(
+ parse_interval("months", "1 year 1 day 0.1 milliseconds").unwrap(),
+ ScalarValue::new_interval_mdn(12, 1, 1_00 * 1_000)
+ );
}
}
diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs
index 8251a5546..f82be2a5b 100644
--- a/datafusion/core/tests/sql/expr.rs
+++ b/datafusion/core/tests/sql/expr.rs
@@ -917,11 +917,11 @@ async fn test_interval_expressions() -> Result<()> {
);
test_expression!(
"interval '0.499 day'",
- "0 years 0 mons 0 days 11 hours 58 mins 33.596 secs"
+ "0 years 0 mons 0 days 11 hours 58 mins 33.600 secs"
);
test_expression!(
"interval '0.4999 day'",
- "0 years 0 mons 0 days 11 hours 59 mins 51.364 secs"
+ "0 years 0 mons 0 days 11 hours 59 mins 51.360 secs"
);
test_expression!(
"interval '0.49999 day'",
@@ -929,7 +929,7 @@ async fn test_interval_expressions() -> Result<()> {
);
test_expression!(
"interval '0.49999999999 day'",
- "0 years 0 mons 0 days 12 hours 0 mins 0.00 secs"
+ "0 years 0 mons 0 days 11 hours 59 mins 59.999999136 secs"
);
test_expression!(
"interval '5 day'",