You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2022/06/29 09:29:26 UTC
[arrow-rs] branch master updated: Set is_adjusted_to_utc if any timezone set (#1932) (#1953)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 420d6695f Set is_adjusted_to_utc if any timezone set (#1932) (#1953)
420d6695f is described below
commit 420d6695f72381a7a0e1ebd2a18ad4ef6ff0f8b3
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Wed Jun 29 10:29:22 2022 +0100
Set is_adjusted_to_utc if any timezone set (#1932) (#1953)
* Set is_adjusted_to_utc if any timezone set (#1932)
* Fix roundtrip
---
arrow/src/datatypes/datatype.rs | 57 +++++++++++++++++++++++++++++++++++
parquet/src/arrow/schema.rs | 10 ++----
parquet/src/arrow/schema/primitive.rs | 2 +-
3 files changed, 61 insertions(+), 8 deletions(-)
diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs
index d0c9bb692..f3cb58d84 100644
--- a/arrow/src/datatypes/datatype.rs
+++ b/arrow/src/datatypes/datatype.rs
@@ -78,6 +78,63 @@ pub enum DataType {
/// * As used in the Olson time zone database (the "tz database" or
/// "tzdata"), such as "America/New_York"
/// * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+ ///
+ /// Timestamps with a non-empty timezone
+ /// ------------------------------------
+ ///
+ /// If a Timestamp column has a non-empty timezone value, its epoch is
+ /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+ /// (the Unix epoch), regardless of the Timestamp's own timezone.
+ ///
+ /// Therefore, timestamp values with a non-empty timezone correspond to
+ /// physical points in time together with some additional information about
+ /// how the data was obtained and/or how to display it (the timezone).
+ ///
+ /// For example, the timestamp value 0 with the timezone string "Europe/Paris"
+ /// corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+ /// application may prefer to display it as "January 1st 1970, 01h00" in
+ /// the Europe/Paris timezone (which is the same physical point in time).
+ ///
+ /// One consequence is that timestamp values with a non-empty timezone
+ /// can be compared and ordered directly, since they all share the same
+ /// well-known point of reference (the Unix epoch).
+ ///
+ /// Timestamps with an unset / empty timezone
+ /// -----------------------------------------
+ ///
+ /// If a Timestamp column has no timezone value, its epoch is
+ /// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+ ///
+ /// Therefore, timestamp values without a timezone cannot be meaningfully
+ /// interpreted as physical points in time, but only as calendar / clock
+ /// indications ("wall clock time") in an unspecified timezone.
+ ///
+ /// For example, the timestamp value 0 with an empty timezone string
+ /// corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+ /// is not enough information to interpret it as a well-defined physical
+ /// point in time.
+ ///
+ /// One consequence is that timestamp values without a timezone cannot
+ /// be reliably compared or ordered, since they may have different points of
+ /// reference. In particular, it is *not* possible to interpret an unset
+ /// or empty timezone as the same as "UTC".
+ ///
+ /// Conversion between timezones
+ /// ----------------------------
+ ///
+ /// If a Timestamp column has a non-empty timezone, changing the timezone
+ /// to a different non-empty value is a metadata-only operation:
+ /// the timestamp values need not change as their point of reference remains
+ /// the same (the Unix epoch).
+ ///
+ /// However, if a Timestamp column has no timezone value, changing it to a
+ /// non-empty value requires thinking about the desired semantics.
+ /// One possibility is to assume that the original timestamp values are
+ /// relative to the epoch of the timezone being set; timestamp values should
+ /// then be adjusted to the Unix epoch (for example, changing the timezone from
+ /// empty to "Europe/Paris" would require converting the timestamp values
+ /// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+ /// nevertheless correct).
Timestamp(TimeUnit, Option<String>),
/// A 32-bit date representing the elapsed time since UNIX epoch (1970-01-01)
/// in days (32 bits).
diff --git a/parquet/src/arrow/schema.rs b/parquet/src/arrow/schema.rs
index 71ae11d08..a65c75853 100644
--- a/parquet/src/arrow/schema.rs
+++ b/parquet/src/arrow/schema.rs
@@ -301,14 +301,10 @@ fn arrow_to_parquet_type(field: &Field) -> Result<Type> {
.build()
}
DataType::Timestamp(time_unit, tz) => {
- let is_utc = tz
- .as_ref()
- .map(|tz| tz == "UTC" || tz == "+00:00" || tz == "-00:00")
- .unwrap_or(false);
-
Type::primitive_type_builder(name, PhysicalType::INT64)
.with_logical_type(Some(LogicalType::Timestamp {
- is_adjusted_to_u_t_c: is_utc,
+ // If timezone set, values are normalized to UTC timezone
+ is_adjusted_to_u_t_c: matches!(tz, Some(z) if !z.as_str().is_empty()),
unit: match time_unit {
TimeUnit::Second => unreachable!(),
TimeUnit::Millisecond => {
@@ -1290,7 +1286,7 @@ mod tests {
REQUIRED INT64 ts_micro_utc (TIMESTAMP(MICROS, true));
REQUIRED INT64 ts_millis_zero_offset (TIMESTAMP(MILLIS, true));
REQUIRED INT64 ts_millis_zero_negative_offset (TIMESTAMP(MILLIS, true));
- REQUIRED INT64 ts_micro_non_utc (TIMESTAMP(MICROS, false));
+ REQUIRED INT64 ts_micro_non_utc (TIMESTAMP(MICROS, true));
REQUIRED GROUP struct {
REQUIRED BOOLEAN bools;
REQUIRED INT32 uint32 (INTEGER(32,false));
diff --git a/parquet/src/arrow/schema/primitive.rs b/parquet/src/arrow/schema/primitive.rs
index 0816b6b2f..0cee5aa1e 100644
--- a/parquet/src/arrow/schema/primitive.rs
+++ b/parquet/src/arrow/schema/primitive.rs
@@ -53,7 +53,7 @@ fn apply_hint(parquet: DataType, hint: DataType) -> DataType {
(DataType::Date32, DataType::Date64) => hint,
// Determine timezone
- (DataType::Timestamp(p, None), DataType::Timestamp(h, Some(_))) if p == h => hint,
+ (DataType::Timestamp(p, _), DataType::Timestamp(h, Some(_))) if p == h => hint,
// Determine offset size
(DataType::Utf8, DataType::LargeUtf8) => hint,