You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2021/08/19 12:09:16 UTC

[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #10610: ARROW-13033: [C++] Kernel to localize naive timestamps to a timezone (preserving clock-time)

jorisvandenbossche commented on a change in pull request #10610:
URL: https://github.com/apache/arrow/pull/10610#discussion_r692034160



##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -1029,5 +1055,19 @@ ARROW_EXPORT Result<Datum> Subsecond(const Datum& values, ExecContext* ctx = NUL
 ARROW_EXPORT Result<Datum> Strftime(const Datum& values, StrftimeOptions options,
                                     ExecContext* ctx = NULLPTR);
 
+/// \brief Converts timestamps from one timezone to another for each element of `values`

Review comment:
       Not from one timezone to another, but from local timestamp without timezone to timestamp with timezone, interpreting the local timestamp as being in the specified timezone.

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal.cc
##########
@@ -570,6 +586,71 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert timestamps from arbitrary timezone to UTC
+
+template <typename Duration, typename Localizer>
+struct TzLocalize {
+  explicit TzLocalize(const TemporalLocalizationOptions* options, Localizer&& localizer)
+      : options(*options) {}
+
+  template <typename T, typename Arg0>
+  T get_local_time(Arg0 arg) const {
+    return static_cast<T>(
+        arrow_vendored::date::zoned_time<Duration>(
+            options.tz, arrow_vendored::date::local_time<Duration>(Duration{arg}))
+            .get_sys_time()
+            .time_since_epoch()
+            .count());
+  }
+
+  template <typename T, typename Arg0>
+  T get_local_time(Arg0 arg, const arrow_vendored::date::choose choose) const {
+    return static_cast<T>(
+        arrow_vendored::date::zoned_time<Duration>(
+            options.tz, arrow_vendored::date::local_time<Duration>(Duration{arg}), choose)
+            .get_sys_time()
+            .time_since_epoch()
+            .count());
+  }
+
+  template <typename T, typename Arg0>
+  T Call(KernelContext*, Arg0 arg, Status* st) const {
+    try {
+      return get_local_time<T, Arg0>(arg);
+    } catch (const arrow_vendored::date::nonexistent_local_time& e) {
+      switch (options.nonexistent) {
+        case TemporalLocalizationOptions::Nonexistent::NONEXISTENT_RAISE: {
+          *st = Status::Invalid("Nonexistent: ", e.what());
+          return arg;
+        }
+        case TemporalLocalizationOptions::Nonexistent::NONEXISTENT_EARLIEST: {
+          return get_local_time<T, Arg0>(arg, arrow_vendored::date::choose::earliest);
+        }
+        case TemporalLocalizationOptions::Nonexistent::NONEXISTENT_LATEST: {
+          return get_local_time<T, Arg0>(arg, arrow_vendored::date::choose::latest);
+        }
+      }
+    } catch (const arrow_vendored::date::ambiguous_local_time& e) {
+      switch (options.ambiguous) {
+        case TemporalLocalizationOptions::Ambiguous::AMBIGUOUS_RAISE: {
+          *st = Status::Invalid("Ambiguous: ", e.what());

Review comment:
       ```suggestion
             *st = Status::Invalid("Ambiguous timestamp: ", e.what());
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal.cc
##########
@@ -570,6 +586,71 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert timestamps from arbitrary timezone to UTC

Review comment:
       This comment is also a bit confusing (it's technically correct, since this is the conversion that is happening under the hood, but it's not how the kernel is documented)

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal.cc
##########
@@ -840,6 +967,13 @@ const FunctionDoc strftime_doc{
     {"timestamps"},
     "StrftimeOptions"};
 
+const FunctionDoc tz_localize_doc{
+    "TzLocalize converts timestamps to UTC keeping the original timezone information",
+    ("TzLocalize converts timestamps from arbitrary timezone to UTC.\n"

Review comment:
       Can you update this description?

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal.cc
##########
@@ -570,6 +586,71 @@ struct Strftime {
 };
 #endif
 
+// ----------------------------------------------------------------------
+// Convert timestamps from arbitrary timezone to UTC
+
+template <typename Duration, typename Localizer>
+struct TzLocalize {
+  explicit TzLocalize(const TemporalLocalizationOptions* options, Localizer&& localizer)
+      : options(*options) {}
+
+  template <typename T, typename Arg0>
+  T get_local_time(Arg0 arg) const {
+    return static_cast<T>(
+        arrow_vendored::date::zoned_time<Duration>(
+            options.tz, arrow_vendored::date::local_time<Duration>(Duration{arg}))
+            .get_sys_time()
+            .time_since_epoch()
+            .count());
+  }
+
+  template <typename T, typename Arg0>
+  T get_local_time(Arg0 arg, const arrow_vendored::date::choose choose) const {
+    return static_cast<T>(
+        arrow_vendored::date::zoned_time<Duration>(
+            options.tz, arrow_vendored::date::local_time<Duration>(Duration{arg}), choose)
+            .get_sys_time()
+            .time_since_epoch()
+            .count());
+  }
+
+  template <typename T, typename Arg0>
+  T Call(KernelContext*, Arg0 arg, Status* st) const {
+    try {
+      return get_local_time<T, Arg0>(arg);
+    } catch (const arrow_vendored::date::nonexistent_local_time& e) {
+      switch (options.nonexistent) {
+        case TemporalLocalizationOptions::Nonexistent::NONEXISTENT_RAISE: {
+          *st = Status::Invalid("Nonexistent: ", e.what());

Review comment:
       ```suggestion
             *st = Status::Invalid("Nonexistent timestamp: ", e.what());
   ```

##########
File path: cpp/src/arrow/compute/kernels/scalar_temporal.cc
##########
@@ -161,6 +162,22 @@ struct TemporalComponentExtract
   }
 };
 
+template <template <typename...> class Op, typename Duration, typename OutType>
+struct TemporalLocalization
+    : public TemporalComponentExtractBase<Op, Duration, OutType> {
+  using Base = TemporalComponentExtractBase<Op, Duration, OutType>;
+
+  static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    const TemporalLocalizationOptions& options = TemporalLocalizationState::Get(ctx);
+    const auto& timezone = GetInputTimezone(batch.values[0]);
+    if (!timezone.empty() && options.timezone != timezone) {
+      return Status::Invalid("Input and output timezones do not match: ", timezone,

Review comment:
       Does this need to work with timezoned input? It could also always raise an error (so without checking if the timezones are the same)
   
   Also, if we keep it, I would expect it to do nothing? While currently it does actually do some conversion:
   
   ```
   In [21]: ts = pd.to_datetime(["2018-03-10 09:00"]).tz_localize("US/Eastern")
   
   In [22]: tsa = pa.array(ts)
   
   In [23]: tsa.to_pandas()
   Out[23]: 
   0   2018-03-10 09:00:00-05:00
   dtype: datetime64[ns, US/Eastern]
   
   In [24]: pc.tz_localize(tsa, timezone="US/Eastern").to_pandas()
   Out[24]: 
   0   2018-03-10 14:00:00-05:00
   dtype: datetime64[ns, US/Eastern]
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org