You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "doki23 (via GitHub)" <gi...@apache.org> on 2023/02/25 08:34:44 UTC

[GitHub] [arrow-rs] doki23 opened a new pull request, #3762: Cast string interval

doki23 opened a new pull request, #3762:
URL: https://github.com/apache/arrow-rs/pull/3762

   # Which issue does this PR close?
   Closes #3643.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126763320


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible

Review Comment:
   I don't think this is correct: the number of days in a month is not fixed. The Postgres docs make an exception for the case of fractional dates, but I don't think we should do this in general.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold merged pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold merged PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1127332823


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,237 @@ impl Parser for Date64Type {
     }
 }
 
+pub fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos.
+/// Fractional units must be spilled to smaller units.
+/// Fractional parts of units greater than months are rounded to be an integer number of months,
+/// e.g. '1.5 years' becomes '12 mons + 6 mons', returns (18, 0, 0)
+/// Fractional parts of months, weeks, days, hours, minutes, seconds and milliseconds are computed
+/// to be an integer number of days and nanoseconds, assuming 30 days per month and 24 hours per day,
+/// e.g., '1.75 months' becomes '1 mon + 22 days + 12 hours', returns (1, 22, 12 * `NANOS_PER_HOUR`)
+/// leading field is the default unit. e.g. leading field is `second`, `1` = `1 second`
+fn parse_interval(leading_field: &str, value: &str) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i32, i32, i64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)
+            }
+            IntervalType::Decade => {
+                align_interval_parts(interval_period * 120_f64, 0.0, 0.0)
+            }
+            IntervalType::Year => {
+                align_interval_parts(interval_period * 12_f64, 0.0, 0.0)
+            }
+            IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0),
+            IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0),
+            IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0),
+            IntervalType::Hour => Ok((
+                0,
+                0,
+                (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64,
+            )),
+            IntervalType::Minute => {
+                Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Second => {
+                Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Millisecond => {
+                Ok((0, 0, (interval_period * 1_000_000f64) as i64))
+            }
+        }
+    };
+
+    let mut result_month: i32 = 0;
+    let mut result_days: i32 = 0;
+    let mut result_nanos: i64 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    while let Some(interval_period_str) = parts.next() {
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str, unit)?;
+
+        result_month =
+            result_month
+                .checked_add(diff_month)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_days =
+            result_days
+                .checked_add(diff_days)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_nanos =
+            result_nanos
+                .checked_add(diff_nanos)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+    }
+
+    Ok((result_month, result_days, result_nanos))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 MONTH
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> Result<(i32, i32, i64), ArrowError> {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible
+    day_part += ((nanos_part as i64) / (NANOS_PER_DAY as i64)) as f64;
+    month_part += ((day_part as i64) / 30_i64) as f64;
+    nanos_part %= NANOS_PER_DAY;
+    day_part %= 30_f64;

Review Comment:
   I agree, but it will affect `parse_interval_year_month`: we would no longer be able to parse `30 days` to `IntervalYearMonthType`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Cast string to interval

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1123998579


##########
arrow-cast/src/parse.rs:
##########
@@ -445,9 +446,229 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+#[cfg(test)]
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+#[cfg(test)]
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::NotYetImplemented(format!(

Review Comment:
   Should this be an `InvalidArgumentError` instead?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1127332823


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,237 @@ impl Parser for Date64Type {
     }
 }
 
+pub fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos.
+/// Fractional units must be spilled to smaller units.
+/// Fractional parts of units greater than months are rounded to be an integer number of months,
+/// e.g. '1.5 years' becomes '12 mons + 6 mons', returns (18, 0, 0)
+/// Fractional parts of months, weeks, days, hours, minutes, seconds and milliseconds are computed
+/// to be an integer number of days and nanoseconds, assuming 30 days per month and 24 hours per day,
+/// e.g., '1.75 months' becomes '1 mon + 22 days + 12 hours', returns (1, 22, 12 * `NANOS_PER_HOUR`)
+/// leading field is the default unit. e.g. leading field is `second`, `1` = `1 second`
+fn parse_interval(leading_field: &str, value: &str) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i32, i32, i64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)
+            }
+            IntervalType::Decade => {
+                align_interval_parts(interval_period * 120_f64, 0.0, 0.0)
+            }
+            IntervalType::Year => {
+                align_interval_parts(interval_period * 12_f64, 0.0, 0.0)
+            }
+            IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0),
+            IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0),
+            IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0),
+            IntervalType::Hour => Ok((
+                0,
+                0,
+                (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64,
+            )),
+            IntervalType::Minute => {
+                Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Second => {
+                Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Millisecond => {
+                Ok((0, 0, (interval_period * 1_000_000f64) as i64))
+            }
+        }
+    };
+
+    let mut result_month: i32 = 0;
+    let mut result_days: i32 = 0;
+    let mut result_nanos: i64 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    while let Some(interval_period_str) = parts.next() {
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str, unit)?;
+
+        result_month =
+            result_month
+                .checked_add(diff_month)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_days =
+            result_days
+                .checked_add(diff_days)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_nanos =
+            result_nanos
+                .checked_add(diff_nanos)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+    }
+
+    Ok((result_month, result_days, result_nanos))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 MONTH
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> Result<(i32, i32, i64), ArrowError> {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible
+    day_part += ((nanos_part as i64) / (NANOS_PER_DAY as i64)) as f64;
+    month_part += ((day_part as i64) / 30_i64) as f64;
+    nanos_part %= NANOS_PER_DAY;
+    day_part %= 30_f64;

Review Comment:
   I agree.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on pull request #3762: Cast string to interval

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#issuecomment-1446577236

   It seems that the parse function is too complex to get auto-vectorized.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126763320


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible

Review Comment:
   I don't think this is correct: the number of days in a month is not fixed. The PostgreSQL docs make an exception for the case of fractional dates, but I don't see any indication that it does this in the general case.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] alamb commented on pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#issuecomment-1457991408

   Thanks again @doki23 and @tustvold


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Cast string to interval

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1125460995


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible

Review Comment:
   @alamb Thank you for your review; I have made some changes here. For instance, an input of 31 days was previously kept as 31 days, but it is now returned as 1 month 1 day. I think this is more ergonomic.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126195163


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {

Review Comment:
   This overflow checking logic is incorrect, as the above computation could wrap around to a value that doesn't look like it has overflowed. The line above should use `checked_add` instead.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126190855


##########
arrow-cast/src/parse.rs:
##########
@@ -871,6 +1097,117 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_parse_interval() {

Review Comment:
   I think it would be good to have a test for negative fractional quantities, e.g. `-1.1 month`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126195163


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {

Review Comment:
   This overflow checking logic is incorrect, as the above computation could wrap around to a value that doesn't look like it has overflowed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126189888


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))

Review Comment:
   I'm not sure this truncation logic is correct for negative quantities



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126188441


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway

Review Comment:
   Is this correct? It seems to assume a month has 30 days, which isn't true?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#issuecomment-1457264684

   @alamb I'm glad to stick with this because your reviews give me some programming inspiration, and it is fun for me.


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Cast string to interval

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1125460995


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible

Review Comment:
   @alamb Thank you for your review. I made some changes here. For instance, previously it left 30 days as 30 days, but now it returns 1 month. I think it's more ergonomic.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126805189


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,237 @@ impl Parser for Date64Type {
     }
 }
 
+pub fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos.
+/// Fractional units must be spilled to smaller units.
+/// Fractional parts of units greater than months are rounded to be an integer number of months,
+/// e.g. '1.5 years' becomes '12 mons + 6 mons', returns (18, 0, 0)
+/// Fractional parts of months, weeks, days, hours, minutes, seconds and milliseconds are computed
+/// to be an integer number of days and nanoseconds, assuming 30 days per month and 24 hours per day,
+/// e.g., '1.75 months' becomes '1 mon + 22 days + 12 hours', returns (1, 22, 12 * `NANOS_PER_HOUR`)
+/// leading field is the default unit. e.g. leading field is `second`, `1` = `1 second`
+fn parse_interval(leading_field: &str, value: &str) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i32, i32, i64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)
+            }
+            IntervalType::Decade => {
+                align_interval_parts(interval_period * 120_f64, 0.0, 0.0)
+            }
+            IntervalType::Year => {
+                align_interval_parts(interval_period * 12_f64, 0.0, 0.0)
+            }
+            IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0),
+            IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0),
+            IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0),
+            IntervalType::Hour => Ok((
+                0,
+                0,
+                (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64,
+            )),
+            IntervalType::Minute => {
+                Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Second => {
+                Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Millisecond => {
+                Ok((0, 0, (interval_period * 1_000_000f64) as i64))
+            }
+        }
+    };
+
+    let mut result_month: i32 = 0;
+    let mut result_days: i32 = 0;
+    let mut result_nanos: i64 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    while let Some(interval_period_str) = parts.next() {
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str, unit)?;
+
+        result_month =
+            result_month
+                .checked_add(diff_month)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_days =
+            result_days
+                .checked_add(diff_days)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_nanos =
+            result_nanos
+                .checked_add(diff_nanos)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+    }
+
+    Ok((result_month, result_days, result_nanos))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 MONTH
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> Result<(i32, i32, i64), ArrowError> {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible
+    day_part += ((nanos_part as i64) / (NANOS_PER_DAY as i64)) as f64;
+    month_part += ((day_part as i64) / 30_i64) as f64;
+    nanos_part %= NANOS_PER_DAY;
+    day_part %= 30_f64;

Review Comment:
   ```suggestion
   ```
   I would suggest removing this: not only is it potentially incorrect, as months don't have a fixed number of days, but integer division is also very slow (although LLVM may be smart enough to convert this to fixed-point multiplication).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] alamb commented on pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#issuecomment-1456850940

   BTW thank you for sticking with this @doki23  -- I didn't realize how much more work would be needed after a more thorough review in arrow-rs 😓 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1127193666


##########
arrow-cast/src/parse.rs:
##########
@@ -871,6 +1103,127 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_parse_interval() {
+        assert_eq!(
+            (1i32, 0i32, 0i64),
+            parse_interval("months", "1 month").unwrap(),
+        );
+
+        assert_eq!(
+            (2i32, 0i32, 0i64),
+            parse_interval("months", "2 month").unwrap(),
+        );
+
+        assert_eq!(
+            (-1i32, -18i32, (-0.2 * NANOS_PER_DAY) as i64),
+            parse_interval("months", "-1.5 months -3.2 days").unwrap(),

Review Comment:
   @tustvold Here it is.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] alamb commented on pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#issuecomment-1456851333

   Also, thank you @tustvold for the thorough review!


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#issuecomment-1457937874

   Thank you for this :+1: 


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1127332823


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,237 @@ impl Parser for Date64Type {
     }
 }
 
+pub fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos.
+/// Fractional units must be spilled to smaller units.
+/// Fractional parts of units greater than months are rounded to be an integer number of months,
+/// e.g. '1.5 years' becomes '12 mons + 6 mons', returns (18, 0, 0)
+/// Fractional parts of months, weeks, days, hours, minutes, seconds and milliseconds are computed
+/// to be an integer number of days and nanoseconds, assuming 30 days per month and 24 hours per day,
+/// e.g., '1.75 months' becomes '1 mon + 22 days + 12 hours', returns (1, 22, 12 * `NANOS_PER_HOUR`)
+/// leading field is the default unit. e.g. leading field is `second`, `1` = `1 second`
+fn parse_interval(leading_field: &str, value: &str) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i32, i32, i64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)
+            }
+            IntervalType::Decade => {
+                align_interval_parts(interval_period * 120_f64, 0.0, 0.0)
+            }
+            IntervalType::Year => {
+                align_interval_parts(interval_period * 12_f64, 0.0, 0.0)
+            }
+            IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0),
+            IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0),
+            IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0),
+            IntervalType::Hour => Ok((
+                0,
+                0,
+                (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64,
+            )),
+            IntervalType::Minute => {
+                Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Second => {
+                Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Millisecond => {
+                Ok((0, 0, (interval_period * 1_000_000f64) as i64))
+            }
+        }
+    };
+
+    let mut result_month: i32 = 0;
+    let mut result_days: i32 = 0;
+    let mut result_nanos: i64 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    while let Some(interval_period_str) = parts.next() {
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str, unit)?;
+
+        result_month =
+            result_month
+                .checked_add(diff_month)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_days =
+            result_days
+                .checked_add(diff_days)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_nanos =
+            result_nanos
+                .checked_add(diff_nanos)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+    }
+
+    Ok((result_month, result_days, result_nanos))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 MONTH
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> Result<(i32, i32, i64), ArrowError> {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible
+    day_part += ((nanos_part as i64) / (NANOS_PER_DAY as i64)) as f64;
+    month_part += ((day_part as i64) / 30_i64) as f64;
+    nanos_part %= NANOS_PER_DAY;
+    day_part %= 30_f64;

Review Comment:
   I agree.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126659594


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {

Review Comment:
   That's because smaller units can be converted into larger units. For instance, `(i32::MAX + x)` days can spill `(i32::MAX + x) / 30` months over into the month part.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126186836


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }

Review Comment:
   ```suggestion
       while let Some(interval_period_str) = parts.next() {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126716042


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))

Review Comment:
   It's ok, I've added a unit test for it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126772103


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,237 @@ impl Parser for Date64Type {
     }
 }
 
+pub fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos.
+/// Fractional units must be spilled to smaller units.
+/// Fractional parts of units greater than months are rounded to be an integer number of months,
+/// e.g. '1.5 years' becomes '12 mons + 6 mons', returns (18, 0, 0)
+/// Fractional parts of months, weeks, days, hours, minutes, seconds and milliseconds are computed
+/// to be an integer number of days and nanoseconds, assuming 30 days per month and 24 hours per day,
+/// e.g., '1.75 months' becomes '1 mon + 22 days + 12 hours', returns (1, 22, 12 * `NANOS_PER_HOUR`)
+/// leading field is the default unit. e.g. leading field is `second`, `1` = `1 second`
+fn parse_interval(leading_field: &str, value: &str) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i32, i32, i64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232

Review Comment:
   ```suggestion
           // TODO: Use fixed-point arithmetic to avoid truncation and rounding errors (#3809)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126761858


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))

Review Comment:
   I can't see a test that has a negative fractional number of days or months. Am I being blind?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] alamb commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126946887


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,237 @@ impl Parser for Date64Type {
     }
 }
 
+pub fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos.
+/// Fractional units must be spilled to smaller units.
+/// Fractional parts of units greater than months are rounded to be an integer number of months,
+/// e.g. '1.5 years' becomes '12 mons + 6 mons', returns (18, 0, 0)
+/// Fractional parts of months, weeks, days, hours, minutes, seconds and milliseconds are computed
+/// to be an integer number of days and nanoseconds, assuming 30 days per month and 24 hours per day,
+/// e.g., '1.75 months' becomes '1 mon + 22 days + 12 hours', returns (1, 22, 12 * `NANOS_PER_HOUR`)
+/// leading field is the default unit. e.g. leading field is `second`, `1` = `1 second`
+fn parse_interval(leading_field: &str, value: &str) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i32, i32, i64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)
+            }
+            IntervalType::Decade => {
+                align_interval_parts(interval_period * 120_f64, 0.0, 0.0)
+            }
+            IntervalType::Year => {
+                align_interval_parts(interval_period * 12_f64, 0.0, 0.0)
+            }
+            IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0),
+            IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0),
+            IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0),
+            IntervalType::Hour => Ok((
+                0,
+                0,
+                (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64,
+            )),
+            IntervalType::Minute => {
+                Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Second => {
+                Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Millisecond => {
+                Ok((0, 0, (interval_period * 1_000_000f64) as i64))
+            }
+        }
+    };
+
+    let mut result_month: i32 = 0;
+    let mut result_days: i32 = 0;
+    let mut result_nanos: i64 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    while let Some(interval_period_str) = parts.next() {
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str, unit)?;
+
+        result_month =
+            result_month
+                .checked_add(diff_month)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_days =
+            result_days
+                .checked_add(diff_days)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_nanos =
+            result_nanos
+                .checked_add(diff_nanos)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+    }
+
+    Ok((result_month, result_days, result_nanos))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 MONTH
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> Result<(i32, i32, i64), ArrowError> {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible
+    day_part += ((nanos_part as i64) / (NANOS_PER_DAY as i64)) as f64;
+    month_part += ((day_part as i64) / 30_i64) as f64;
+    nanos_part %= NANOS_PER_DAY;
+    day_part %= 30_f64;

Review Comment:
   I like the idea of removing the "convert to higher units" logic and filing a ticket to (potentially) support it
   
   I am happy to file such a ticket
   
   I agree that converting `40 days` --> `1 month and 10 days`, as this PR currently does, doesn't seem correct



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] alamb commented on a diff in pull request #3762: Cast string to interval

Posted by "alamb (via GitHub)" <gi...@apache.org>.
alamb commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1123436418


##########
arrow-cast/src/cast.rs:
##########
@@ -2624,6 +2647,75 @@ fn cast_string_to_timestamp<
     Ok(Arc::new(array) as ArrayRef)
 }
 
+fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap();
+    let interval_array = if cast_options.safe {
+        let iter = string_array
+            .iter()
+            .map(|v| v.and_then(|v| parse_interval_year_month(v).ok()));
+        unsafe { IntervalYearMonthArray::from_trusted_len_iter(iter) }

Review Comment:
   I think we should add some justification for using `unsafe` in the code below (which is that the iterator came from a string_array so it has a known good length)



##########
arrow-cast/src/test_util.rs:
##########
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utility functions to make testing DataFusion based crates easier
+
+/// A macro to assert that one string is contained within another with
+/// a nice error message if they are not.
+///
+/// Usage: `assert_contains!(actual, expected)`
+///
+/// Is a macro so test error
+/// messages are on the same line as the failure;
+///
+/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>)
+#[macro_export]
+macro_rules! assert_contains {
+    ($ACTUAL: expr, $EXPECTED: expr) => {
+        let actual_value: String = $ACTUAL.into();
+        let expected_value: String = $EXPECTED.into();
+        assert!(
+            actual_value.contains(&expected_value),
+            "Can not find expected in actual.\n\nExpected:\n{}\n\nActual:\n{}",
+            expected_value,
+            actual_value
+        );
+    };
+}
+
+/// A macro to assert that one string is NOT contained within another with
+/// a nice error message if they are are.
+///
+/// Usage: `assert_not_contains!(actual, unexpected)`
+///
+/// Is a macro so test error
+/// messages are on the same line as the failure;
+///
+/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>)
+#[macro_export]
+macro_rules! assert_not_contains {

Review Comment:
   This does not appear to be used



##########
arrow-cast/src/parse.rs:
##########
@@ -445,9 +446,229 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+#[cfg(test)]
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+#[cfg(test)]
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::NotYetImplemented(format!(

Review Comment:
   I think `ParseError` would be more appropriate for the errors in this function: https://docs.rs/arrow/34.0.0/arrow/error/enum.ArrowError.html#variant.ParseError 



##########
arrow-cast/src/parse.rs:
##########
@@ -871,6 +1092,117 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_parse_interval() {
+        assert_eq!(
+            (1i32, 0i32, 0i64),
+            parse_interval("months", "1 month").unwrap(),
+        );
+
+        assert_eq!(
+            (2i32, 0i32, 0i64),
+            parse_interval("months", "2 month").unwrap(),
+        );
+
+        assert_contains!(
+            parse_interval("months", "1 centurys 1 month")
+                .unwrap_err()
+                .to_string(),
+            r#"Invalid input syntax for type interval: "1 centurys 1 month""#
+        );
+

Review Comment:
   Since this is a single use of `assert_contains` what do you think about simply checking the error directly:
   
   
   ```suggestion
           assert_eq!(
               parse_interval("months", "1 centurys 1 month")
                   .unwrap_err()
                   .to_string(), 
               r#"(ADD DETAILS) Invalid input syntax for type interval: "1 centurys 1 month""#
           );
   
   ```



##########
arrow-cast/src/cast.rs:
##########
@@ -4975,6 +5067,56 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_cast_string_to_interval() {
+        let interval_string_values = vec![
+            Some("1 year 1 month 1 day"),
+            None,
+            Some("1.5 years 13 month 35 days 1.4 milliseconds"),
+            Some("3 days"),
+            Some("8 seconds"),
+            None,
+            Some("1 day 29800 milliseconds"),
+            Some("3 months 1 second"),
+            Some("6 minutes 120 second"),
+            Some("2 years 39 months 9 days 19 hours 1 minute 83 seconds 399222 milliseconds"),
+        ];
+        let string_array =
+            Arc::new(StringArray::from(interval_string_values.clone())) as ArrayRef;
+        let options = CastOptions { safe: false };
+        let array_ref = cast_with_options(
+            &string_array.clone(),
+            &DataType::Interval(IntervalUnit::MonthDayNano),

Review Comment:
   Can we please add coverage for:
   1.  `IntervalUnit::DayTime` and `IntervalUnit::YearMonth` 
   2. converting something that doesn't parse (`"foobar"`, for example), and showing it raises an error with `safe = false` and is converted to null with `safe = true`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126185850


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(

Review Comment:
   Could we document this method better? It isn't clear to me how it is meant to be used. In particular, `leading_field` is never documented AFAICT



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Cast string to interval

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1123998579


##########
arrow-cast/src/parse.rs:
##########
@@ -445,9 +446,229 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+#[cfg(test)]
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+#[cfg(test)]
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::NotYetImplemented(format!(

Review Comment:
   It may be an `InvalidArgumentError`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126402464


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway

Review Comment:
   According to [the PostgreSQL interval input documentation](https://www.postgresql.org/docs/current/datatype-datetime.html#DATATYPE-INTERVAL-INPUT:~:text=Field%20values%20can,fractional%20on%20output.), it's correct. But that reminds me that it may be incorrect to spill smaller units into larger units.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126758340


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_days += diff_days;
+
+        if result_days > (i32::MAX as i64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        result_nanos += diff_nanos as i128;
+
+        if result_nanos > (i64::MAX as i128) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+    }
+
+    Ok((result_month as i32, result_days as i32, result_nanos as i64))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 month
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> (i64, i64, f64) {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway

Review Comment:
   In particular
   
   > Field values can have fractional parts: for example, '1.5 weeks' or '01:02:03.45'. However, because interval internally stores only three integer units (months, days, microseconds), fractional units must be spilled to smaller units. Fractional parts of units greater than months are rounded to be an integer number of months, e.g. '1.5 years' becomes '1 year 6 mons'. Fractional parts of weeks and days are computed to be an integer number of days and microseconds, assuming 30 days per month and 24 hours per day, e.g., '1.75 months' becomes 1 mon 22 days 12:00:00. Only seconds will ever be shown as fractional on output.
   
   Perhaps we could add a note, with a link?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126193043


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {

Review Comment:
   Why do the accumulators differ from the return type, i.e. `i32, i32, i64`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126202998


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(

Review Comment:
   I personally would be inclined to make these methods public and `parse_interval` private.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] doki23 commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "doki23 (via GitHub)" <gi...@apache.org>.
doki23 commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126718580


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,231 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {

Review Comment:
   @tustvold Thank you. I've followed your suggestions, but I'm not sure I've done it right. Please take another look.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3762: Support for casting `Utf8` and `LargeUtf8` --> `Interval`

Posted by "tustvold (via GitHub)" <gi...@apache.org>.
tustvold commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1126805189


##########
arrow-cast/src/parse.rs:
##########
@@ -445,6 +446,237 @@ impl Parser for Date64Type {
     }
 }
 
+pub fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("years", value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalYearMonth because the value isn't multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast {value} to IntervalDayTime because the nanos part isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+const NANOS_PER_DAY: f64 = 24_f64 * NANOS_PER_HOUR;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos.
+/// Fractional units must be spilled to smaller units.
+/// Fractional parts of units greater than months are rounded to be an integer number of months,
+/// e.g. '1.5 years' becomes '12 mons + 6 mons', returns (18, 0, 0)
+/// Fractional parts of months, weeks, days, hours, minutes, seconds and milliseconds are computed
+/// to be an integer number of days and nanoseconds, assuming 30 days per month and 24 hours per day,
+/// e.g., '1.75 months' becomes '1 mon + 22 days + 12 hours', returns (1, 22, 12 * `NANOS_PER_HOUR`)
+/// leading field is the default unit. e.g. leading field is `second`, `1` = `1 second`
+fn parse_interval(leading_field: &str, value: &str) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i32, i32, i64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::ParseError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                align_interval_parts(interval_period * 1200_f64, 0.0, 0.0)
+            }
+            IntervalType::Decade => {
+                align_interval_parts(interval_period * 120_f64, 0.0, 0.0)
+            }
+            IntervalType::Year => {
+                align_interval_parts(interval_period * 12_f64, 0.0, 0.0)
+            }
+            IntervalType::Month => align_interval_parts(interval_period, 0.0, 0.0),
+            IntervalType::Week => align_interval_parts(0.0, interval_period * 7_f64, 0.0),
+            IntervalType::Day => align_interval_parts(0.0, interval_period, 0.0),
+            IntervalType::Hour => Ok((
+                0,
+                0,
+                (interval_period * SECONDS_PER_HOUR * NANOS_PER_SECOND) as i64,
+            )),
+            IntervalType::Minute => {
+                Ok((0, 0, (interval_period * 60_f64 * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Second => {
+                Ok((0, 0, (interval_period * NANOS_PER_SECOND) as i64))
+            }
+            IntervalType::Millisecond => {
+                Ok((0, 0, (interval_period * 1_000_000f64) as i64))
+            }
+        }
+    };
+
+    let mut result_month: i32 = 0;
+    let mut result_days: i32 = 0;
+    let mut result_nanos: i64 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    while let Some(interval_period_str) = parts.next() {
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str, unit)?;
+
+        result_month =
+            result_month
+                .checked_add(diff_month)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_days =
+            result_days
+                .checked_add(diff_days)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+
+        result_nanos =
+            result_nanos
+                .checked_add(diff_nanos)
+                .ok_or(ArrowError::ParseError(format!(
+                    "Interval field value out of range: {value:?}"
+                )))?;
+    }
+
+    Ok((result_month, result_days, result_nanos))
+}
+
+/// We are storing parts as integers, it's why we need to align parts fractional
+/// INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days
+/// INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours
+/// INTERVAL '30 DAYS' = 1 MONTH
+fn align_interval_parts(
+    mut month_part: f64,
+    mut day_part: f64,
+    mut nanos_part: f64,
+) -> Result<(i32, i32, i64), ArrowError> {
+    // Convert fractional month to days, It's not supported by Arrow types, but anyway
+    day_part += (month_part - (month_part as i64) as f64) * 30_f64;
+
+    // Convert fractional days to hours
+    nanos_part += (day_part - ((day_part as i64) as f64))
+        * 24_f64
+        * SECONDS_PER_HOUR
+        * NANOS_PER_SECOND;
+
+    // Convert to higher units as much as possible
+    day_part += ((nanos_part as i64) / (NANOS_PER_DAY as i64)) as f64;
+    month_part += ((day_part as i64) / 30_i64) as f64;
+    nanos_part %= NANOS_PER_DAY;
+    day_part %= 30_f64;

Review Comment:
   ```suggestion
   ```
   I would suggest removing this: not only is it potentially incorrect, as months don't have a fixed number of days, but integer division is also very slow.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org