You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "waitingkuo (via GitHub)" <gi...@apache.org> on 2023/04/16 14:28:05 UTC

[GitHub] [arrow-datafusion] waitingkuo commented on a diff in pull request #5982: feat: support month and year interval for date_bin on constant data

waitingkuo commented on code in PR #5982:
URL: https://github.com/apache/arrow-datafusion/pull/5982#discussion_r1167954831


##########
datafusion/physical-expr/src/datetime_expressions.rs:
##########
@@ -333,19 +333,67 @@ pub fn date_trunc(args: &[ColumnarValue]) -> Result<ColumnarValue> {
     })
 }
 
-fn date_bin_single(stride: i64, source: i64, origin: i64) -> i64 {
+// return time in nanoseconds that the source timestamp falls into based on the stride and origin
+fn date_bin_nanos_interval(stride_nanos: i64, source: i64, origin: i64) -> i64 {
     let time_diff = source - origin;
-    // distance to bin
+
+    // distance from origin to bin
+    let time_delta = compute_distance(time_diff, stride_nanos);
+
+    origin + time_delta
+}
+
+// distance from origin to bin
+fn compute_distance(time_diff: i64, stride: i64) -> i64 {
     let time_delta = time_diff - (time_diff % stride);
 
-    let time_delta = if time_diff < 0 && stride > 1 {
+    if time_diff < 0 && stride > 1 {
         // The origin is later than the source timestamp, round down to the previous bin
         time_delta - stride
     } else {
         time_delta
+    }
+}
+
+// return time in nanoseconds that the source timestamp falls into based on the stride and origin
+fn date_bin_months_interval(stride_months: i64, source: i64, origin: i64) -> i64 {
+    // convert source and origin to DateTime<Utc>
+    let source_date = to_utc_date_time(source);
+    let origin_date = to_utc_date_time(origin);
+
+    // calculate the number of months between the source and origin
+    let month_diff = (source_date.year() - origin_date.year()) * 12
+        + source_date.month() as i32
+        - origin_date.month() as i32;
+
+    // distance from origin to bin
+    let month_delta = compute_distance(month_diff as i64, stride_months);
+
+    let mut bin_time = if month_delta < 0 {
+        origin_date - Months::new(month_delta.unsigned_abs() as u32)
+    } else {
+        origin_date + Months::new(month_delta as u32)
     };
 
-    origin + time_delta
+    // If origin is not midnight of first date of the month, the bin_time may be larger than the source
+    // In this case, we need to move back to previous bin
+    if bin_time > source_date {
+        let month_delta = month_delta - stride_months;
+        bin_time = if month_delta < 0 {
+            origin_date - Months::new(month_delta.unsigned_abs() as u32)
+        } else {
+            origin_date + Months::new(month_delta as u32)
+        };
+    }

Review Comment:
   this makes `2000-01-31T00:00:00` becomes `2000-01-29T00:00:00` in 
   `❯ select date_bin('1 month', timestamp '2000-01-31T00:00:00', timestamp '2000-02-29T00:00:00');`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org