You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/29 09:29:26 UTC

[arrow-rs] branch master updated: Set is_adjusted_to_utc if any timezone set (#1932) (#1953)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 420d6695f Set is_adjusted_to_utc if any timezone set (#1932) (#1953)
420d6695f is described below

commit 420d6695f72381a7a0e1ebd2a18ad4ef6ff0f8b3
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Jun 29 10:29:22 2022 +0100

    Set is_adjusted_to_utc if any timezone set (#1932) (#1953)
    
    * Set is_adjusted_to_utc if any timezone set (#1932)
    
    * Fix roundtrip
---
 arrow/src/datatypes/datatype.rs       | 57 +++++++++++++++++++++++++++++++++++
 parquet/src/arrow/schema.rs           | 10 ++----
 parquet/src/arrow/schema/primitive.rs |  2 +-
 3 files changed, 61 insertions(+), 8 deletions(-)

diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs
index d0c9bb692..f3cb58d84 100644
--- a/arrow/src/datatypes/datatype.rs
+++ b/arrow/src/datatypes/datatype.rs
@@ -78,6 +78,63 @@ pub enum DataType {
     /// * As used in the Olson time zone database (the "tz database" or
     ///   "tzdata"), such as "America/New_York"
     /// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+    ///
+    /// Timestamps with a non-empty timezone
+    /// ------------------------------------
+    ///
+    /// If a Timestamp column has a non-empty timezone value, its epoch is
+    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+    /// (the Unix epoch), regardless of the Timestamp's own timezone.
+    ///
+    /// Therefore, timestamp values with a non-empty timezone correspond to
+    /// physical points in time together with some additional information about
+    /// how the data was obtained and/or how to display it (the timezone).
+    ///
+    ///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
+    ///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+    ///   application may prefer to display it as "January 1st 1970, 01h00" in
+    ///   the Europe/Paris timezone (which is the same physical point in time).
+    ///
+    /// One consequence is that timestamp values with a non-empty timezone
+    /// can be compared and ordered directly, since they all share the same
+    /// well-known point of reference (the Unix epoch).
+    ///
+    /// Timestamps with an unset / empty timezone
+    /// -----------------------------------------
+    ///
+    /// If a Timestamp column has no timezone value, its epoch is
+    /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+    ///
+    /// Therefore, timestamp values without a timezone cannot be meaningfully
+    /// interpreted as physical points in time, but only as calendar / clock
+    /// indications ("wall clock time") in an unspecified timezone.
+    ///
+    ///   For example, the timestamp value 0 with an empty timezone string
+    ///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+    ///   is not enough information to interpret it as a well-defined physical
+    ///   point in time.
+    ///
+    /// One consequence is that timestamp values without a timezone cannot
+    /// be reliably compared or ordered, since they may have different points of
+    /// reference.  In particular, it is *not* possible to interpret an unset
+    /// or empty timezone as the same as "UTC".
+    ///
+    /// Conversion between timezones
+    /// ----------------------------
+    ///
+    /// If a Timestamp column has a non-empty timezone, changing the timezone
+    /// to a different non-empty value is a metadata-only operation:
+    /// the timestamp values need not change as their point of reference remains
+    /// the same (the Unix epoch).
+    ///
+    /// However, if a Timestamp column has no timezone value, changing it to a
+    /// non-empty value requires thinking about the desired semantics.
+    /// One possibility is to assume that the original timestamp values are
+    /// relative to the epoch of the timezone being set; timestamp values should
+    /// then be adjusted to the Unix epoch (for example, changing the timezone from
+    /// empty to "Europe/Paris" would require converting the timestamp values
+    /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+    /// nevertheless correct).
     Timestamp(TimeUnit, Option<String>),
     /// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
     /// in days (32 bits).
diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs
index 71ae11d08..a65c75853 100644
--- a/parquet/src/arrow/schema.rs
+++ b/parquet/src/arrow/schema.rs
@@ -301,14 +301,10 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
                 .build()
         }
         DataType::Timestamp(time_unit, tz) => {
-            let is_utc = tz
-                .as_ref()
-                .map(|tz| tz == "UTC" || tz == "+00:00" || tz == "-00:00")
-                .unwrap_or(false);
-
             Type::primitive_type_builder(name, PhysicalType::INT64)
                 .with_logical_type(Some(LogicalType::Timestamp {
-                    is_adjusted_to_u_t_c: is_utc,
+                    // If timezone set, values are normalized to UTC timezone
+                    is_adjusted_to_u_t_c: matches!(tz, Some(z) if !z.as_str().is_empty()),
                     unit: match time_unit {
                         TimeUnit::Second => unreachable!(),
                         TimeUnit::Millisecond => {
@@ -1290,7 +1286,7 @@ mod tests {
             REQUIRED INT64   ts_micro_utc (TIMESTAMP(MICROS, true));
             REQUIRED INT64   ts_millis_zero_offset (TIMESTAMP(MILLIS, true));
             REQUIRED INT64   ts_millis_zero_negative_offset (TIMESTAMP(MILLIS, true));
-            REQUIRED INT64   ts_micro_non_utc (TIMESTAMP(MICROS, false));
+            REQUIRED INT64   ts_micro_non_utc (TIMESTAMP(MICROS, true));
             REQUIRED GROUP struct {
                 REQUIRED BOOLEAN bools;
                 REQUIRED INT32 uint32 (INTEGER(32,false));
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index 0816b6b2f..0cee5aa1e 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -53,7 +53,7 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
         (DataType::Date32, DataType::Date64) => hint,
 
         // Determine timezone
-        (DataType::Timestamp(p, None), DataType::Timestamp(h, Some(_))) if p == h => hint,
+        (DataType::Timestamp(p, _), DataType::Timestamp(h, Some(_))) if p == h => hint,
 
         // Determine offset size
         (DataType::Utf8, DataType::LargeUtf8) => hint,