You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/20 14:57:37 UTC

[GitHub] [arrow] jonkeane commented on a diff in pull request #13196: ARROW-16407: [R] Extend `parse_date_time` to cover hour, dates, and minutes components

jonkeane commented on code in PR #13196:
URL: https://github.com/apache/arrow/pull/13196#discussion_r878211101


##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +213,130 @@ build_formats <- function(orders) {
 }
 
 build_format_from_order <- function(order) {
-  year_chars <- c("%y", "%Y")
-  month_chars <- c("%m", "%B", "%b")
-  day_chars <- "%d"
-
-  outcome <- switch(
-    order,
-    "ymd" = expand.grid(year_chars, month_chars, day_chars),
-    "ydm" = expand.grid(year_chars, day_chars, month_chars),
-    "mdy" = expand.grid(month_chars, day_chars, year_chars),
-    "myd" = expand.grid(month_chars, year_chars, day_chars),
-    "dmy" = expand.grid(day_chars, month_chars, year_chars),
-    "dym" = expand.grid(day_chars, year_chars, month_chars)
+  char_list <- list(
+    "y" = c("%y", "%Y"),
+    "m" = c("%m", "%B", "%b"),
+    "d" = "%d",
+    "H" = "%H",

Review Comment:
   Do we also want | need `%I` here? 



##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +213,130 @@ build_formats <- function(orders) {
 }
 
 build_format_from_order <- function(order) {
-  year_chars <- c("%y", "%Y")
-  month_chars <- c("%m", "%B", "%b")
-  day_chars <- "%d"
-
-  outcome <- switch(
-    order,
-    "ymd" = expand.grid(year_chars, month_chars, day_chars),
-    "ydm" = expand.grid(year_chars, day_chars, month_chars),
-    "mdy" = expand.grid(month_chars, day_chars, year_chars),
-    "myd" = expand.grid(month_chars, year_chars, day_chars),
-    "dmy" = expand.grid(day_chars, month_chars, year_chars),
-    "dym" = expand.grid(day_chars, year_chars, month_chars)
+  char_list <- list(
+    "y" = c("%y", "%Y"),
+    "m" = c("%m", "%B", "%b"),
+    "d" = "%d",
+    "H" = "%H",
+    "M" = "%M",
+    "S" = "%S"
+  )
+
+  split_order <- strsplit(order, split = "")[[1]]
+
+  outcome <- expand.grid(char_list[split_order])
+  formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+  formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+  c(formats_with_sep, formats_without_sep)
+}
+
+process_data_for_parsing <- function(x,
+                                     orders) {
+
+  processed_x <- x$cast(string())
+
+  # make all separators (non-letters and non-numbers) into "-"
+  processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+  # collapse multiple separators into a single one
+  processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+  # we need to transform `x` when orders are `ym`, `my`, and `yq`
+  # for `ym` and `my` orders we add a day ("01")
+  augmented_x_ym <- NULL
+  if (any(orders %in% c("ym", "my"))) {
+    # add day as "-01" if there is a "-" separator and as "01" if not
+    augmented_x_ym <- call_binding(
+      "if_else",
+      call_binding("grepl", "-", processed_x),
+      call_binding("paste0", processed_x, "-01"),
+      call_binding("paste0", processed_x, "01")
+    )
+  }
+
+  # for `yq` we need to transform the quarter into the start month (lubridate
+  # behaviour) and then add 01 to parse to the first day of the quarter
+  augmented_x_yq <- NULL
+  if (any(orders == "yq")) {
+    # extract everything that comes after the `-` separator, i.e. the quarter
+    # (e.g. 4 from 2022-4)
+    quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+    # we should probably error if quarter is not in 1:4
+    # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+    # in 2002-4)
+    year_x <- call_binding("gsub", "-.*$", "", processed_x)
+    quarter_x <- quarter_x$cast(int32())
+    month_x <- (quarter_x - 1) * 3 + 1
+    augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+  }
+
+  list(
+    "augmented_x_ym" = augmented_x_ym,
+    "augmented_x_yq" = augmented_x_yq,
+    "processed_x" = processed_x
+  )
+}
+
+attempt_parsing <- function(x,
+                            orders,
+                            formats = NULL) {
+  if (is.null(formats)) {
+    # this is the situation in which orders were passed with `exact = TRUE`
+    # no data processing takes place
+    # we don't derive formats as the orders are assumed to be formats
+    parse_attempt_expressions <- build_strptime_exps(x, orders)
+    return(parse_attempt_expressions)

Review Comment:
   ```suggestion
       return(build_strptime_exps(x, orders))
   ```
   
   Slightly more simplified?



##########
r/tests/testthat/test-dplyr-funcs-datetime.R:
##########
@@ -1942,3 +1925,206 @@ test_that("lubridate's fast_strptime", {
       collect()
   )
 })
+
+test_that("parse_date_time with hours, minutes and seconds components", {
+  test_dates_times <- tibble(
+    ymd_hms_string =
+      c("67-01-09 12:34:56", "1970-05-22 20:13:59", "870822201359", NA),
+    ymd_hm_string =
+      c("67-01-09 12:34", "1970-05-22 20:13", "8708222013", NA),
+    ymd_h_string =
+      c("67-01-09 12", "1970-05-22 20", "87082220", NA),
+    dmy_hms_string =
+      c("09-01-67 12:34:56", "22-05-1970 20:13:59", "220887201359", NA),
+    dmy_hm_string =
+      c("09-01-67 12:34", "22-05-1970 20:13",  "2208872013", NA),
+    dmy_h_string =
+      c("09-01-67 12", "22-05-1970 20", "22088720", NA),
+    mdy_hms_string =
+      c("01-09-67 12:34:56", "05-22-1970 20:13:59", "082287201359", NA),
+    mdy_hm_string =
+      c("01-09-67 12:34", "05-22-1970 20:13", "0822872013", NA),
+    mdy_h_string =
+      c("01-09-67 12", "05-22-1970 20", "08228720", NA),
+    ydm_hms_string =
+      c("67-09-01 12:34:56", "1970-22-05 20:13:59", "872208201359", NA),
+    ydm_hm_string =
+      c("67-09-01 12:34", "1970-22-05 20:13", "8722082013", NA),
+    ydm_h_string =
+      c("67-09-01 12", "1970-22-05 20", "87220820", NA)
+  )
+  # the unseparated strings are versions of "1987-08-22 20:13:59" (with %y)
+
+  # these functions' internals use some string processing which requires the
+  # RE2 library (not available on Windows with R 3.6)
+  skip_if_not_available("re2")
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        ymd_hms_dttm = parse_date_time(ymd_hms_string, orders = "ymd_HMS"),
+        ymd_hm_dttm  = parse_date_time(ymd_hm_string, orders = "ymd_HM"),
+        ymd_h_dttm   = parse_date_time(ymd_h_string, orders = "ymd_H"),
+        dmy_hms_dttm = parse_date_time(dmy_hms_string, orders = "dmy_HMS"),
+        dmy_hm_dttm  = parse_date_time(dmy_hm_string, orders = "dmy_HM"),
+        dmy_h_dttm   = parse_date_time(dmy_h_string, orders = "dmy_H"),
+        mdy_hms_dttm = parse_date_time(mdy_hms_string, orders = "mdy_HMS"),
+        mdy_hm_dttm  = parse_date_time(mdy_hm_string, orders = "mdy_HM"),
+        mdy_h_dttm   = parse_date_time(mdy_h_string, orders = "mdy_H"),
+        ydm_hms_dttm = parse_date_time(ydm_hms_string, orders = "ydm_HMS"),
+        ydm_hm_dttm  = parse_date_time(ydm_hm_string, orders = "ydmHM"),
+        ydm_h_dttm   = parse_date_time(ydm_h_string, orders = "ydmH")
+      ) %>%
+      collect(),
+    test_dates_times
+  )
+
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        ymd_hms_dttm =
+          parse_date_time(ymd_hms_string, orders = "ymd_HMS", tz = "Pacific/Marquesas"),
+        ymd_hm_dttm =
+          parse_date_time(ymd_hm_string, orders = "ymd_HM", tz = "Pacific/Marquesas"),
+        ymd_h_dttm =
+          parse_date_time(ymd_h_string, orders = "ymd_H", tz = "Pacific/Marquesas"),
+        dmy_hms_dttm =
+          parse_date_time(dmy_hms_string, orders = "dmy_HMS", tz = "Pacific/Marquesas"),
+        dmy_hm_dttm =
+          parse_date_time(dmy_hm_string, orders = "dmy_HM", tz = "Pacific/Marquesas"),
+        dmy_h_dttm =
+          parse_date_time(dmy_h_string, orders = "dmy_H", tz = "Pacific/Marquesas"),
+        mdy_hms_dttm =
+          parse_date_time(mdy_hms_string, orders = "mdy_HMS", tz = "Pacific/Marquesas"),
+        mdy_hm_dttm =
+          parse_date_time(mdy_hm_string, orders = "mdy_HM", tz = "Pacific/Marquesas"),
+        mdy_h_dttm =
+          parse_date_time(mdy_h_string, orders = "mdy_H", tz = "Pacific/Marquesas"),
+        ydm_hms_dttm =
+          parse_date_time(ydm_hms_string, orders = "ydm_HMS", tz = "Pacific/Marquesas"),
+        ydm_hm_dttm =
+          parse_date_time(ydm_hm_string, orders = "ydm_HM", tz = "Pacific/Marquesas"),
+        ydm_h_dttm =
+          parse_date_time(ydm_h_string, orders = "ydm_H", tz = "Pacific/Marquesas")
+      ) %>%
+      collect(),
+    test_dates_times
+  )
+
+  # test truncated formats
+  compare_dplyr_binding(
+    .input %>%
+      mutate(
+        dttm =
+          parse_date_time(
+            truncated_ymd_string,
+            orders = "ymd_HMS",
+            truncated = 3
+          )
+      ) %>%
+      collect(),
+    tibble(
+      truncated_ymd_string =
+        c(
+          "2022-05-19 13:46:51",
+          "2022-05-18 13:46",
+          "2022-05-17 13",
+          "2022-05-16"
+        )
+    )
+  )
+
+  # we need expect_warning twice as both the arrow pipeline (because quiet =
+  # FALSE is not supported) and the fallback dplyr/lubridate one throw
+  # warnings (the lubridate one because quiet is FALSE)
+  expect_warning(
+    expect_warning(
+      tibble(x = c("2022-05-19 13:46:51")) %>%
+        arrow_table() %>%
+        mutate(
+          x_dttm = parse_date_time(x, orders = "dmy_HMS", quiet = FALSE)
+        ) %>%
+        collect(),
+      "`quiet = FALSE` not supported in Arrow"
+    )
+  )

Review Comment:
   Should we assert the second warning here?



##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +213,130 @@ build_formats <- function(orders) {
 }
 
 build_format_from_order <- function(order) {
-  year_chars <- c("%y", "%Y")
-  month_chars <- c("%m", "%B", "%b")
-  day_chars <- "%d"
-
-  outcome <- switch(
-    order,
-    "ymd" = expand.grid(year_chars, month_chars, day_chars),
-    "ydm" = expand.grid(year_chars, day_chars, month_chars),
-    "mdy" = expand.grid(month_chars, day_chars, year_chars),
-    "myd" = expand.grid(month_chars, year_chars, day_chars),
-    "dmy" = expand.grid(day_chars, month_chars, year_chars),
-    "dym" = expand.grid(day_chars, year_chars, month_chars)
+  char_list <- list(
+    "y" = c("%y", "%Y"),
+    "m" = c("%m", "%B", "%b"),
+    "d" = "%d",
+    "H" = "%H",
+    "M" = "%M",
+    "S" = "%S"
+  )
+
+  split_order <- strsplit(order, split = "")[[1]]
+
+  outcome <- expand.grid(char_list[split_order])
+  formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+  formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+  c(formats_with_sep, formats_without_sep)
+}
+
+process_data_for_parsing <- function(x,
+                                     orders) {
+
+  processed_x <- x$cast(string())
+
+  # make all separators (non-letters and non-numbers) into "-"
+  processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+  # collapse multiple separators into a single one
+  processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+  # we need to transform `x` when orders are `ym`, `my`, and `yq`
+  # for `ym` and `my` orders we add a day ("01")
+  augmented_x_ym <- NULL
+  if (any(orders %in% c("ym", "my"))) {
+    # add day as "-01" if there is a "-" separator and as "01" if not
+    augmented_x_ym <- call_binding(
+      "if_else",
+      call_binding("grepl", "-", processed_x),
+      call_binding("paste0", processed_x, "-01"),
+      call_binding("paste0", processed_x, "01")
+    )
+  }
+
+  # for `yq` we need to transform the quarter into the start month (lubridate
+  # behaviour) and then add 01 to parse to the first day of the quarter
+  augmented_x_yq <- NULL
+  if (any(orders == "yq")) {
+    # extract everything that comes after the `-` separator, i.e. the quarter
+    # (e.g. 4 from 2022-4)
+    quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+    # we should probably error if quarter is not in 1:4
+    # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+    # in 2002-4)
+    year_x <- call_binding("gsub", "-.*$", "", processed_x)
+    quarter_x <- quarter_x$cast(int32())
+    month_x <- (quarter_x - 1) * 3 + 1
+    augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+  }
+
+  list(
+    "augmented_x_ym" = augmented_x_ym,
+    "augmented_x_yq" = augmented_x_yq,
+    "processed_x" = processed_x
+  )
+}
+
+attempt_parsing <- function(x,
+                            orders,
+                            formats = NULL) {
+  if (is.null(formats)) {
+    # this is the situation in which orders were passed with `exact = TRUE`
+    # no data processing takes place
+    # we don't derive formats as the orders are assumed to be formats
+    parse_attempt_expressions <- build_strptime_exps(x, orders)
+    return(parse_attempt_expressions)
+  }
+
+  processed_data <- process_data_for_parsing(x, orders)
+
+  processed_x <- processed_data[["processed_x"]]
+  augmented_x_ym <- processed_data[["augmented_x_ym"]]
+  augmented_x_yq <- processed_data[["augmented_x_yq"]]
+
+  # build a list of expressions for each format
+  parse_attempt_exp_processed_x <- build_strptime_exps(processed_x, formats)
+
+  # build separate expression lists of parsing attempts for the orders that
+  # need an augmented `x`
+  # list for attempts when orders %in% c("ym", "my")
+  parse_attempt_exp_augmented_x_ym <- list()
+
+  if (!is.null(augmented_x_ym)) {
+    parse_attempt_exp_augmented_x_ym <- build_strptime_exps(augmented_x_ym, formats)
+  }
+
+  # list for attempts when orders %in% c("yq")
+  parse_attempt_exp_augmented_x_yq <- list()
+  if (!is.null(augmented_x_yq)) {
+    parse_attempt_exp_augmented_x_yq <- build_strptime_exps(augmented_x_yq, formats)
+  }
+
+  # combine all attempts expressions in prep for coalesce
+  # if the users passes only a short order (`ym`, `my` or `yq`) then only use
+  # the corresponding augmented_x
+  if (all(orders == "ym") || all(orders == "my")) {
+    parse_attempt_expressions <- parse_attempt_exp_augmented_x_ym
+  } else if (all(orders == "yq")) {
+    parse_attempt_expressions <- parse_attempt_exp_augmented_x_yq
+  } else {
+    parse_attempt_expressions <- c(
+      # if we have an augmented x give preference to the corresponding
+      # parsing attempts
+      parse_attempt_exp_augmented_x_ym,
+      parse_attempt_exp_augmented_x_yq,
+      parse_attempt_exp_processed_x
+    )
+  }
+  parse_attempt_expressions
+}

Review Comment:
   This is a pretty hairy chunk that's pretty hard to follow. We should make it a bit easier on our future selves to follow and maintain it by doing some combination of (or all) of the following:
   
   * Adding slightly more detailed comments, especially explaining what general shape of things might be in the arguments — it's hard to know now what things are lists of expressions versus lists of characters like `"Ym"` versus  lists of characters like `"%Y-%m"`
   * Refactor this code to work on more directly, instead of having sets of functions that produce lists that you need to subset out of, would it be possible to have functions that take as input (vectors of) single elements and operate on them to go from `"Ym"` to the expression, expanding as necessary.



##########
r/R/dplyr-funcs-datetime.R:
##########
@@ -488,105 +488,28 @@ register_bindings_duration_helpers <- function() {
 register_bindings_datetime_parsers <- function() {
   register_binding("parse_date_time", function(x,
                                                orders,
-                                               tz = "UTC") {
-
-    # each order is translated into possible formats
-    formats <- build_formats(orders)
-
-    x <- x$cast(string())
-
-    # make all separators (non-letters and non-numbers) into "-"
-    x <- call_binding("gsub", "[^A-Za-z0-9]", "-", x)
-    # collapse multiple separators into a single one
-    x <- call_binding("gsub", "-{2,}", "-", x)
-
-    # we need to transform `x` when orders are `ym`, `my`, and `yq`
-    # for `ym` and `my` orders we add a day ("01")
-    augmented_x <- NULL
-    if (any(orders %in% c("ym", "my"))) {
-      augmented_x <- call_binding("paste0", x, "-01")
-    }
-
-    # for `yq` we need to transform the quarter into the start month (lubridate
-    # behaviour) and then add 01 to parse to the first day of the quarter
-    augmented_x2 <- NULL
-    if (any(orders == "yq")) {
-      # extract everything that comes after the `-` separator, i.e. the quarter
-      # (e.g. 4 from 2022-4)
-      quarter_x <- call_binding("gsub", "^.*?-", "", x)
-      # we should probably error if quarter is not in 1:4
-      # extract everything that comes before the `-`, i.e. the year (e.g. 2002
-      # in 2002-4)
-      year_x <- call_binding("gsub", "-.*$", "", x)
-      quarter_x <- quarter_x$cast(int32())
-      month_x <- (quarter_x - 1) * 3 + 1
-      augmented_x2 <- call_binding("paste0", year_x, "-", month_x, "-01")
-    }
-
-    # TODO figure out how to parse strings that have no separators
-    # https://issues.apache.org/jira/browse/ARROW-16446
-    # we could insert separators at the "likely" positions, but it might be
-    # tricky given the possible combinations between dmy formats + locale
-
-    # build a list of expressions for each format
-    parse_attempt_expressions <- map(
-      formats,
-      ~ build_expr(
-        "strptime",
-        x,
-        options = list(
-          format = .x,
-          unit = 0L,
-          error_is_null = TRUE
-        )
-      )
-    )
-
-    # build separate expression lists of parsing attempts for the orders that
-    # need an augmented `x`
-    # list for attempts when orders %in% c("ym", "my")
-    parse_attempt_exp_augmented_x <- list()
-
-    if (!is.null(augmented_x)) {
-      parse_attempt_exp_augmented_x <- map(
-        formats,
-        ~ build_expr(
-          "strptime",
-          augmented_x,
-          options = list(
-            format = .x,
-            unit = 0L,
-            error_is_null = TRUE
-          )
-        )
-      )
+                                               tz = "UTC",
+                                               truncated = 0,
+                                               quiet = TRUE,
+                                               exact = FALSE) {
+    if (!quiet) {
+      arrow_not_supported("`quiet = FALSE`")
     }
 
-    # list for attempts when orders %in% c("yq")
-    parse_attempt_exp_augmented_x2 <- list()
-    if (!is.null(augmented_x2)) {
-      parse_attempt_exp_augmented_x2 <- map(
-        formats,
-        ~ build_expr(
-          "strptime",
-          augmented_x2,
-          options = list(
-            format = .x,
-            unit = 0L,
-            error_is_null = TRUE
-          )
-        )
-      )
+    if (truncated != 0) {
+      # build several orders for truncated formats
+      orders <- map_chr(0:truncated, ~ substr(orders, start = 1, stop = nchar(orders) - .x))

Review Comment:
   Do we have a test where `truncated` is passed and is larger than the smallest order? Something like `lubridate::parse_date_time("2020-01", orders = "Ymd", truncate = 10)`



##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +213,130 @@ build_formats <- function(orders) {
 }
 
 build_format_from_order <- function(order) {
-  year_chars <- c("%y", "%Y")
-  month_chars <- c("%m", "%B", "%b")
-  day_chars <- "%d"
-
-  outcome <- switch(
-    order,
-    "ymd" = expand.grid(year_chars, month_chars, day_chars),
-    "ydm" = expand.grid(year_chars, day_chars, month_chars),
-    "mdy" = expand.grid(month_chars, day_chars, year_chars),
-    "myd" = expand.grid(month_chars, year_chars, day_chars),
-    "dmy" = expand.grid(day_chars, month_chars, year_chars),
-    "dym" = expand.grid(day_chars, year_chars, month_chars)
+  char_list <- list(
+    "y" = c("%y", "%Y"),
+    "m" = c("%m", "%B", "%b"),
+    "d" = "%d",
+    "H" = "%H",
+    "M" = "%M",
+    "S" = "%S"
+  )
+
+  split_order <- strsplit(order, split = "")[[1]]
+
+  outcome <- expand.grid(char_list[split_order])
+  formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+  formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+  c(formats_with_sep, formats_without_sep)

Review Comment:
   This would be something we would want to measure with a benchmark on a bunch of data, but something to consider is instead of doing both with and without separators on all, you could do something like `ifelse(grepl("-", col), formats_with_sep, formats_without_sep)` — though not quite right here, since you would want that to be passed as an expression



##########
r/R/dplyr-datetime-helpers.R:
##########
@@ -201,19 +213,130 @@ build_formats <- function(orders) {
 }
 
 build_format_from_order <- function(order) {
-  year_chars <- c("%y", "%Y")
-  month_chars <- c("%m", "%B", "%b")
-  day_chars <- "%d"
-
-  outcome <- switch(
-    order,
-    "ymd" = expand.grid(year_chars, month_chars, day_chars),
-    "ydm" = expand.grid(year_chars, day_chars, month_chars),
-    "mdy" = expand.grid(month_chars, day_chars, year_chars),
-    "myd" = expand.grid(month_chars, year_chars, day_chars),
-    "dmy" = expand.grid(day_chars, month_chars, year_chars),
-    "dym" = expand.grid(day_chars, year_chars, month_chars)
+  char_list <- list(
+    "y" = c("%y", "%Y"),
+    "m" = c("%m", "%B", "%b"),
+    "d" = "%d",
+    "H" = "%H",
+    "M" = "%M",
+    "S" = "%S"
+  )
+
+  split_order <- strsplit(order, split = "")[[1]]
+
+  outcome <- expand.grid(char_list[split_order])
+  formats_with_sep <- do.call(paste, c(outcome, sep = "-"))
+  formats_without_sep <- do.call(paste, c(outcome, sep = ""))
+  c(formats_with_sep, formats_without_sep)
+}
+
+process_data_for_parsing <- function(x,
+                                     orders) {
+
+  processed_x <- x$cast(string())
+
+  # make all separators (non-letters and non-numbers) into "-"
+  processed_x <- call_binding("gsub", "[^A-Za-z0-9]", "-", processed_x)
+  # collapse multiple separators into a single one
+  processed_x <- call_binding("gsub", "-{2,}", "-", processed_x)
+
+  # we need to transform `x` when orders are `ym`, `my`, and `yq`
+  # for `ym` and `my` orders we add a day ("01")
+  augmented_x_ym <- NULL
+  if (any(orders %in% c("ym", "my"))) {
+    # add day as "-01" if there is a "-" separator and as "01" if not
+    augmented_x_ym <- call_binding(
+      "if_else",
+      call_binding("grepl", "-", processed_x),
+      call_binding("paste0", processed_x, "-01"),
+      call_binding("paste0", processed_x, "01")
+    )
+  }
+
+  # for `yq` we need to transform the quarter into the start month (lubridate
+  # behaviour) and then add 01 to parse to the first day of the quarter
+  augmented_x_yq <- NULL
+  if (any(orders == "yq")) {
+    # extract everything that comes after the `-` separator, i.e. the quarter
+    # (e.g. 4 from 2022-4)
+    quarter_x <- call_binding("gsub", "^.*?-", "", processed_x)
+    # we should probably error if quarter is not in 1:4
+    # extract everything that comes before the `-`, i.e. the year (e.g. 2002
+    # in 2002-4)
+    year_x <- call_binding("gsub", "-.*$", "", processed_x)
+    quarter_x <- quarter_x$cast(int32())
+    month_x <- (quarter_x - 1) * 3 + 1
+    augmented_x_yq <- call_binding("paste0", year_x, "-", month_x, "-01")
+  }

Review Comment:
   Do we have a jira for parsing something like `2020-01` into `2020-01-01` in C++ already? I'm not 100% sure we want to push that kind of logic into C++, but it would be good to if other languages do it as well. If we do, we should list it here to remind ourselves we can get rid of this hack when that issue is done



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org