You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/11/15 22:10:41 UTC
[arrow-rs] branch master updated: Parse Time32/Time64 from formatted string (#3101)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new c95eb4c80 Parse Time32/Time64 from formatted string (#3101)
c95eb4c80 is described below
commit c95eb4c80a532653bc91e04e78814f1282c8d005
Author: Jeffrey <22...@users.noreply.github.com>
AuthorDate: Wed Nov 16 09:10:36 2022 +1100
Parse Time32/Time64 from formatted string (#3101)
* Parse Time32/Time64 from formatted string
* PR comments
* PR comments refactoring
---
arrow-cast/src/parse.rs | 420 +++++++++++++++++++++++++++++++++++++++++++++++-
arrow-csv/src/reader.rs | 35 ++++
2 files changed, 451 insertions(+), 4 deletions(-)
diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index b93d6c800..6de336351 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -132,6 +132,97 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
)))
}
+/// Accepts a string in ISO8601 standard format and some
+/// variants and converts it to nanoseconds since midnight.
+///
+/// Examples of accepted inputs:
+/// * `09:26:56.123 AM`
+/// * `23:59:59`
+/// * `6:00 pm`
+//
+/// Internally, this function uses the `chrono` library for the
+/// time parsing
+///
+/// ## Timezone / Offset Handling
+///
+/// This function does not support parsing strings with a timezone
+/// or offset specified, as it considers only time since midnight.
+pub fn string_to_time_nanoseconds(s: &str) -> Result<i64, ArrowError> {
+ // colon count, presence of decimal, presence of whitespace
+ fn preprocess_time_string(string: &str) -> (usize, bool, bool) {
+ string
+ .as_bytes()
+ .iter()
+ .fold((0, false, false), |tup, char| match char {
+ b':' => (tup.0 + 1, tup.1, tup.2),
+ b'.' => (tup.0, true, tup.2),
+ b' ' => (tup.0, tup.1, true),
+ _ => tup,
+ })
+ }
+
+ // Do a preprocess pass of the string to prune which formats to attempt parsing for
+ let formats: &[&str] = match preprocess_time_string(s.trim()) {
+ // 24-hour clock, with hour, minutes, seconds and fractions of a second specified
+ // Examples:
+ // * 09:50:12.123456789
+ // * 9:50:12.123456789
+ (2, true, false) => &["%H:%M:%S%.f", "%k:%M:%S%.f"],
+
+ // 12-hour clock, with hour, minutes, seconds and fractions of a second specified
+ // Examples:
+ // * 09:50:12.123456789 PM
+ // * 09:50:12.123456789 pm
+ // * 9:50:12.123456789 AM
+ // * 9:50:12.123456789 am
+ (2, true, true) => &[
+ "%I:%M:%S%.f %P",
+ "%I:%M:%S%.f %p",
+ "%l:%M:%S%.f %P",
+ "%l:%M:%S%.f %p",
+ ],
+
+ // 24-hour clock, with hour, minutes and seconds specified
+ // Examples:
+ // * 09:50:12
+ // * 9:50:12
+ (2, false, false) => &["%H:%M:%S", "%k:%M:%S"],
+
+ // 12-hour clock, with hour, minutes and seconds specified
+ // Examples:
+ // * 09:50:12 PM
+ // * 09:50:12 pm
+ // * 9:50:12 AM
+ // * 9:50:12 am
+ (2, false, true) => &["%I:%M:%S %P", "%I:%M:%S %p", "%l:%M:%S %P", "%l:%M:%S %p"],
+
+ // 24-hour clock, with hour and minutes specified
+ // Examples:
+ // * 09:50
+ // * 9:50
+ (1, false, false) => &["%H:%M", "%k:%M"],
+
+ // 12-hour clock, with hour and minutes specified
+ // Examples:
+ // * 09:50 PM
+ // * 09:50 pm
+ // * 9:50 AM
+ // * 9:50 am
+ (1, false, true) => &["%I:%M %P", "%I:%M %p", "%l:%M %P", "%l:%M %p"],
+
+ _ => &[],
+ };
+
+ formats
+ .iter()
+ .find_map(|f| NaiveTime::parse_from_str(s, f).ok())
+ .map(|nt| {
+ nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond() as i64
+ })
+ // Return generic error if failed to parse as unknown which format user intended for the string
+ .ok_or_else(|| ArrowError::CastError(format!("Error parsing '{}' as time", s)))
+}
+
/// Specialized parsing implementations
/// used by csv and json reader
pub trait Parser: ArrowPrimitiveType {
@@ -199,10 +290,76 @@ impl Parser for TimestampSecondType {
}
}
-parser_primitive!(Time64NanosecondType);
-parser_primitive!(Time64MicrosecondType);
-parser_primitive!(Time32MillisecondType);
-parser_primitive!(Time32SecondType);
+impl Parser for Time64NanosecondType {
+ // Will truncate any fractions of a nanosecond
+ fn parse(string: &str) -> Option<Self::Native> {
+ string_to_time_nanoseconds(string)
+ .ok()
+ .or_else(|| string.parse::<Self::Native>().ok())
+ }
+
+ fn parse_formatted(string: &str, format: &str) -> Option<Self::Native> {
+ let nt = NaiveTime::parse_from_str(string, format).ok()?;
+ Some(
+ nt.num_seconds_from_midnight() as i64 * 1_000_000_000
+ + nt.nanosecond() as i64,
+ )
+ }
+}
+
+impl Parser for Time64MicrosecondType {
+ // Will truncate any fractions of a microsecond
+ fn parse(string: &str) -> Option<Self::Native> {
+ string_to_time_nanoseconds(string)
+ .ok()
+ .map(|nanos| nanos / 1_000)
+ .or_else(|| string.parse::<Self::Native>().ok())
+ }
+
+ fn parse_formatted(string: &str, format: &str) -> Option<Self::Native> {
+ let nt = NaiveTime::parse_from_str(string, format).ok()?;
+ Some(
+ nt.num_seconds_from_midnight() as i64 * 1_000_000
+ + nt.nanosecond() as i64 / 1_000,
+ )
+ }
+}
+
+impl Parser for Time32MillisecondType {
+ // Will truncate any fractions of a millisecond
+ fn parse(string: &str) -> Option<Self::Native> {
+ string_to_time_nanoseconds(string)
+ .ok()
+ .map(|nanos| (nanos / 1_000_000) as i32)
+ .or_else(|| string.parse::<Self::Native>().ok())
+ }
+
+ fn parse_formatted(string: &str, format: &str) -> Option<Self::Native> {
+ let nt = NaiveTime::parse_from_str(string, format).ok()?;
+ Some(
+ nt.num_seconds_from_midnight() as i32 * 1_000
+ + nt.nanosecond() as i32 / 1_000_000,
+ )
+ }
+}
+
+impl Parser for Time32SecondType {
+ // Will truncate any fractions of a second
+ fn parse(string: &str) -> Option<Self::Native> {
+ string_to_time_nanoseconds(string)
+ .ok()
+ .map(|nanos| (nanos / 1_000_000_000) as i32)
+ .or_else(|| string.parse::<Self::Native>().ok())
+ }
+
+ fn parse_formatted(string: &str, format: &str) -> Option<Self::Native> {
+ let nt = NaiveTime::parse_from_str(string, format).ok()?;
+ Some(
+ nt.num_seconds_from_midnight() as i32
+ + nt.nanosecond() as i32 / 1_000_000_000,
+ )
+ }
+}
/// Number of days between 0001-01-01 and 1970-01-01
const EPOCH_DAYS_FROM_CE: i32 = 719_163;
@@ -411,4 +568,259 @@ mod tests {
parse_timestamp("2020-09-08 13:42:29").unwrap()
);
}
+
+ #[test]
+ fn parse_time64_nanos() {
+ assert_eq!(
+ Time64NanosecondType::parse("02:10:01.1234567899999999"),
+ Some(7_801_123_456_789)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("02:10:01.1234567"),
+ Some(7_801_123_456_700)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10:01.1234567"),
+ Some(7_801_123_456_700)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("12:10:01.123456789 AM"),
+ Some(601_123_456_789)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("12:10:01.123456789 am"),
+ Some(601_123_456_789)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10:01.12345678 PM"),
+ Some(51_001_123_456_780)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10:01.12345678 pm"),
+ Some(51_001_123_456_780)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("02:10:01"),
+ Some(7_801_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10:01"),
+ Some(7_801_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("12:10:01 AM"),
+ Some(601_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("12:10:01 am"),
+ Some(601_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10:01 PM"),
+ Some(51_001_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10:01 pm"),
+ Some(51_001_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("02:10"),
+ Some(7_800_000_000_000)
+ );
+ assert_eq!(Time64NanosecondType::parse("2:10"), Some(7_800_000_000_000));
+ assert_eq!(
+ Time64NanosecondType::parse("12:10 AM"),
+ Some(600_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("12:10 am"),
+ Some(600_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10 PM"),
+ Some(51_000_000_000_000)
+ );
+ assert_eq!(
+ Time64NanosecondType::parse("2:10 pm"),
+ Some(51_000_000_000_000)
+ );
+
+ // parse directly as nanoseconds
+ assert_eq!(Time64NanosecondType::parse("1"), Some(1));
+
+ // leap second
+ assert_eq!(
+ Time64NanosecondType::parse("23:59:60"),
+ Some(86_400_000_000_000)
+ );
+
+ // custom format
+ assert_eq!(
+ Time64NanosecondType::parse_formatted(
+ "02 - 10 - 01 - .1234567",
+ "%H - %M - %S - %.f"
+ ),
+ Some(7_801_123_456_700)
+ );
+ }
+
+ #[test]
+ fn parse_time64_micros() {
+ // expected formats
+ assert_eq!(
+ Time64MicrosecondType::parse("02:10:01.1234"),
+ Some(7_801_123_400)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("2:10:01.1234"),
+ Some(7_801_123_400)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("12:10:01.123456 AM"),
+ Some(601_123_456)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("12:10:01.123456 am"),
+ Some(601_123_456)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("2:10:01.12345 PM"),
+ Some(51_001_123_450)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("2:10:01.12345 pm"),
+ Some(51_001_123_450)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("02:10:01"),
+ Some(7_801_000_000)
+ );
+ assert_eq!(Time64MicrosecondType::parse("2:10:01"), Some(7_801_000_000));
+ assert_eq!(
+ Time64MicrosecondType::parse("12:10:01 AM"),
+ Some(601_000_000)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("12:10:01 am"),
+ Some(601_000_000)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("2:10:01 PM"),
+ Some(51_001_000_000)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("2:10:01 pm"),
+ Some(51_001_000_000)
+ );
+ assert_eq!(Time64MicrosecondType::parse("02:10"), Some(7_800_000_000));
+ assert_eq!(Time64MicrosecondType::parse("2:10"), Some(7_800_000_000));
+ assert_eq!(Time64MicrosecondType::parse("12:10 AM"), Some(600_000_000));
+ assert_eq!(Time64MicrosecondType::parse("12:10 am"), Some(600_000_000));
+ assert_eq!(
+ Time64MicrosecondType::parse("2:10 PM"),
+ Some(51_000_000_000)
+ );
+ assert_eq!(
+ Time64MicrosecondType::parse("2:10 pm"),
+ Some(51_000_000_000)
+ );
+
+ // parse directly as microseconds
+ assert_eq!(Time64MicrosecondType::parse("1"), Some(1));
+
+ // leap second
+ assert_eq!(
+ Time64MicrosecondType::parse("23:59:60"),
+ Some(86_400_000_000)
+ );
+
+ // custom format
+ assert_eq!(
+ Time64MicrosecondType::parse_formatted(
+ "02 - 10 - 01 - .1234",
+ "%H - %M - %S - %.f"
+ ),
+ Some(7_801_123_400)
+ );
+ }
+
+ #[test]
+ fn parse_time32_millis() {
+ // expected formats
+ assert_eq!(Time32MillisecondType::parse("02:10:01.1"), Some(7_801_100));
+ assert_eq!(Time32MillisecondType::parse("2:10:01.1"), Some(7_801_100));
+ assert_eq!(
+ Time32MillisecondType::parse("12:10:01.123 AM"),
+ Some(601_123)
+ );
+ assert_eq!(
+ Time32MillisecondType::parse("12:10:01.123 am"),
+ Some(601_123)
+ );
+ assert_eq!(
+ Time32MillisecondType::parse("2:10:01.12 PM"),
+ Some(51_001_120)
+ );
+ assert_eq!(
+ Time32MillisecondType::parse("2:10:01.12 pm"),
+ Some(51_001_120)
+ );
+ assert_eq!(Time32MillisecondType::parse("02:10:01"), Some(7_801_000));
+ assert_eq!(Time32MillisecondType::parse("2:10:01"), Some(7_801_000));
+ assert_eq!(Time32MillisecondType::parse("12:10:01 AM"), Some(601_000));
+ assert_eq!(Time32MillisecondType::parse("12:10:01 am"), Some(601_000));
+ assert_eq!(Time32MillisecondType::parse("2:10:01 PM"), Some(51_001_000));
+ assert_eq!(Time32MillisecondType::parse("2:10:01 pm"), Some(51_001_000));
+ assert_eq!(Time32MillisecondType::parse("02:10"), Some(7_800_000));
+ assert_eq!(Time32MillisecondType::parse("2:10"), Some(7_800_000));
+ assert_eq!(Time32MillisecondType::parse("12:10 AM"), Some(600_000));
+ assert_eq!(Time32MillisecondType::parse("12:10 am"), Some(600_000));
+ assert_eq!(Time32MillisecondType::parse("2:10 PM"), Some(51_000_000));
+ assert_eq!(Time32MillisecondType::parse("2:10 pm"), Some(51_000_000));
+
+ // parse directly as milliseconds
+ assert_eq!(Time32MillisecondType::parse("1"), Some(1));
+
+ // leap second
+ assert_eq!(Time32MillisecondType::parse("23:59:60"), Some(86_400_000));
+
+ // custom format
+ assert_eq!(
+ Time32MillisecondType::parse_formatted(
+ "02 - 10 - 01 - .1",
+ "%H - %M - %S - %.f"
+ ),
+ Some(7_801_100)
+ );
+ }
+
+ #[test]
+ fn parse_time32_secs() {
+ // expected formats
+ assert_eq!(Time32SecondType::parse("02:10:01.1"), Some(7_801));
+ assert_eq!(Time32SecondType::parse("02:10:01"), Some(7_801));
+ assert_eq!(Time32SecondType::parse("2:10:01"), Some(7_801));
+ assert_eq!(Time32SecondType::parse("12:10:01 AM"), Some(601));
+ assert_eq!(Time32SecondType::parse("12:10:01 am"), Some(601));
+ assert_eq!(Time32SecondType::parse("2:10:01 PM"), Some(51_001));
+ assert_eq!(Time32SecondType::parse("2:10:01 pm"), Some(51_001));
+ assert_eq!(Time32SecondType::parse("02:10"), Some(7_800));
+ assert_eq!(Time32SecondType::parse("2:10"), Some(7_800));
+ assert_eq!(Time32SecondType::parse("12:10 AM"), Some(600));
+ assert_eq!(Time32SecondType::parse("12:10 am"), Some(600));
+ assert_eq!(Time32SecondType::parse("2:10 PM"), Some(51_000));
+ assert_eq!(Time32SecondType::parse("2:10 pm"), Some(51_000));
+
+ // parse directly as seconds
+ assert_eq!(Time32SecondType::parse("1"), Some(1));
+
+ // leap second
+ assert_eq!(Time32SecondType::parse("23:59:60"), Some(86400));
+
+ // custom format
+ assert_eq!(
+ Time32SecondType::parse_formatted("02 - 10 - 01", "%H - %M - %S"),
+ Some(7_801)
+ );
+ }
}
diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs
index 0bf05960a..4200e9329 100644
--- a/arrow-csv/src/reader.rs
+++ b/arrow-csv/src/reader.rs
@@ -584,6 +584,24 @@ fn parse(
i,
datetime_format,
),
+ DataType::Time32(TimeUnit::Second) => {
+ build_primitive_array::<Time32SecondType>(line_number, rows, i, None)
+ }
+ DataType::Time32(TimeUnit::Millisecond) => build_primitive_array::<
+ Time32MillisecondType,
+ >(
+ line_number, rows, i, None
+ ),
+ DataType::Time64(TimeUnit::Microsecond) => build_primitive_array::<
+ Time64MicrosecondType,
+ >(
+ line_number, rows, i, None
+ ),
+ DataType::Time64(TimeUnit::Nanosecond) => build_primitive_array::<
+ Time64NanosecondType,
+ >(
+ line_number, rows, i, None
+ ),
DataType::Timestamp(TimeUnit::Microsecond, _) => {
build_primitive_array::<TimestampMicrosecondType>(
line_number,
@@ -1593,6 +1611,23 @@ mod tests {
assert_eq!(parse_item::<Date32Type>("1945-05-08").unwrap(), -9004);
}
+ #[test]
+ fn parse_time() {
+ assert_eq!(
+ parse_item::<Time64NanosecondType>("12:10:01.123456789 AM"),
+ Some(601_123_456_789)
+ );
+ assert_eq!(
+ parse_item::<Time64MicrosecondType>("12:10:01.123456 am"),
+ Some(601_123_456)
+ );
+ assert_eq!(
+ parse_item::<Time32MillisecondType>("2:10:01.12 PM"),
+ Some(51_001_120)
+ );
+ assert_eq!(parse_item::<Time32SecondType>("2:10:01 pm"), Some(51_001));
+ }
+
#[test]
fn parse_date64() {
assert_eq!(parse_item::<Date64Type>("1970-01-01T00:00:00").unwrap(), 0);