You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/02/07 23:32:17 UTC

[GitHub] [arrow-rs] alamb commented on a diff in pull request #3647: Lazy array display (#3638)

alamb commented on code in PR #3647:
URL: https://github.com/apache/arrow-rs/pull/3647#discussion_r1099394976


##########
arrow-json/src/writer.rs:
##########
@@ -315,47 +293,24 @@ fn set_column_for_json_rows(
                 row_count
             );
         }
-        DataType::Date32 => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Date64 => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Timestamp(TimeUnit::Second, _) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Timestamp(TimeUnit::Millisecond, _) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Timestamp(TimeUnit::Microsecond, _) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Timestamp(TimeUnit::Nanosecond, _) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Time32(TimeUnit::Second) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Time32(TimeUnit::Millisecond) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Time64(TimeUnit::Microsecond) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Time64(TimeUnit::Nanosecond) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Duration(TimeUnit::Second) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Duration(TimeUnit::Millisecond) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Duration(TimeUnit::Microsecond) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
-        }
-        DataType::Duration(TimeUnit::Nanosecond) => {
-            set_temporal_column_by_array_type!(col_name, col_idx, rows, array, row_count);
+        DataType::Date32
+        | DataType::Date64
+        | DataType::Timestamp(_, _)
+        | DataType::Time32(_)
+        | DataType::Time64(_)
+        | DataType::Duration(_) => {
+            // TODO: Set options

Review Comment:
   Is this a TODO for this PR or for some future PR? It seems like the default options work well enough for this?



##########
arrow-json/src/writer.rs:
##########
@@ -937,7 +892,7 @@ mod tests {
 
         assert_json_eq(
             &buf,
-            r#"{"date32":"2018-11-13","date64":"2018-11-13","name":"a"}
+            r#"{"date32":"2018-11-13","date64":"2018-11-13T17:11:10.011","name":"a"}

Review Comment:
   Is the idea for this change that since Date64 can repreent milliseconds it should be formatted like a timestamp even thought that may be surprising?
   
   https://github.com/apache/arrow/blob/39bad5442c6447bf07594b09e4b29118b3211460/format/Schema.fbs#L214-L215
   
   I admit the docs aren't very helpful in this respect: https://docs.rs/arrow/32.0.0/arrow/array/type.Date64Array.html



##########
arrow-schema/src/datatype.rs:
##########
@@ -290,7 +290,7 @@ pub enum IntervalUnit {
 }
 
 // Sparse or Dense union layouts
-#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
+#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Copy)]

Review Comment:
   👍 



##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -2314,17 +2314,17 @@ mod tests {
         // Verify data is as expected
 
         let expected = r#"
-            +-------------------------------------------------------------------------------------------------------------------------------------+
-            | struct_b                                                                                                                            |
-            +-------------------------------------------------------------------------------------------------------------------------------------+
-            | {"list": [{"leaf_a": 1, "leaf_b": 1}]}                                                                                              |
-            | {"list": null}                                                                                                                      |
-            | {"list": [{"leaf_a": 2, "leaf_b": null}, {"leaf_a": 3, "leaf_b": 2}]}                                                               |
-            | {"list": null}                                                                                                                      |
-            | {"list": [{"leaf_a": 4, "leaf_b": null}, {"leaf_a": 5, "leaf_b": null}]}                                                            |
-            | {"list": [{"leaf_a": 6, "leaf_b": null}, {"leaf_a": 7, "leaf_b": null}, {"leaf_a": 8, "leaf_b": null}, {"leaf_a": 9, "leaf_b": 1}]} |
-            | {"list": [{"leaf_a": 10, "leaf_b": null}]}                                                                                          |
-            +-------------------------------------------------------------------------------------------------------------------------------------+
+            +-------------------------------------------------------------------------------------------------------+

Review Comment:
   this change certainly looks nicer to me -- I wonder if someone wants the old behavior (which looks like JSON), they should use the JSON writer, correct?



##########
arrow/src/util/pretty.rs:
##########
@@ -447,48 +445,24 @@ mod tests {
             "|                           |",
             "+---------------------------+",
         ];
-        check_datetime_with_timezone!(
-            TimestampSecondArray,
-            11111111,
-            "+08:00".to_string(),
-            expected
-        );
+        let actual: Vec<&str> = table.lines().collect();
+        assert_eq!(expected, actual, "Actual result:\n\n{actual:#?}\n\n");
     }
 
     #[test]
+    #[cfg(not(feature = "chrono-tz"))]
     fn test_pretty_format_timestamp_second_with_incorrect_fixed_offset_timezone() {
-        let expected = vec![
-            "+-------------------------------------------------+",
-            "| f                                               |",
-            "+-------------------------------------------------+",
-            "| 1970-05-09T14:25:11 (Unknown Time Zone '08:00') |",
-            "|                                                 |",
-            "+-------------------------------------------------+",
-        ];
-        check_datetime_with_timezone!(
-            TimestampSecondArray,
-            11111111,
-            "08:00".to_string(),
-            expected
-        );
+        let batch = timestamp_batch::<TimestampSecondType>("08:00", 11111111);

Review Comment:
   I wonder why this change in behavior to error rather than print out "ERROR" -- is it because there was no way to return errors before? I wonder if people would be relying on the old behavior somehow?



##########
arrow/src/util/pretty.rs:
##########
@@ -559,12 +533,12 @@ mod tests {
     #[test]
     fn test_pretty_format_date_64() {
         let expected = vec![
-            "+------------+",
-            "| f          |",
-            "+------------+",
-            "| 2005-03-18 |",
-            "|            |",
-            "+------------+",
+            "+---------------------+",
+            "| f                   |",
+            "+---------------------+",
+            "| 2005-03-18T01:58:20 |",

Review Comment:
   This is the same change as in the json code -- I realize Date64 has precision greater than a day, but I wonder if people will be surprised by this change 🤔 



##########
arrow-cast/src/display.rs:
##########
@@ -19,57 +19,499 @@
 //! purposes. See the `pretty` crate for additional functions for
 //! record batch pretty printing.
 
-use std::fmt::Write;
-use std::sync::Arc;
+use std::fmt::{Debug, Display, Formatter, Write};
+use std::ops::Range;
 
+use arrow_array::cast::*;
+use arrow_array::temporal_conversions::*;
 use arrow_array::timezone::Tz;
 use arrow_array::types::*;
 use arrow_array::*;
 use arrow_buffer::ArrowNativeType;
 use arrow_schema::*;
-use chrono::prelude::SecondsFormat;
-use chrono::{DateTime, Utc};
+use chrono::{NaiveDate, NaiveDateTime, SecondsFormat, TimeZone, Utc};
+use lexical_core::FormattedSize;
 
-fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError {
-    ArrowError::CastError(format!(
-        "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}"
-    ))
+type TimeFormat<'a> = Option<&'a str>;
+
+/// Options for formatting arrays
+///
+/// By default nulls are formatted as `""` and temporal types formatted
+/// according to RFC3339
+///
+#[derive(Debug, Clone)]
+pub struct FormatOptions<'a> {
+    safe: bool,
+    /// Format string for nulls
+    null: &'a str,
+    /// Date format for date arrays
+    date_format: TimeFormat<'a>,
+    /// Format for DateTime arrays
+    datetime_format: TimeFormat<'a>,
+    /// Timestamp format for timestamp arrays
+    timestamp_format: TimeFormat<'a>,
+    /// Timestamp format for timestamp with timezone arrays
+    timestamp_tz_format: TimeFormat<'a>,
+    /// Time format for time arrays
+    time_format: TimeFormat<'a>,
+}
+
+impl<'a> Default for FormatOptions<'a> {
+    fn default() -> Self {
+        Self {
+            safe: true,
+            null: "",
+            date_format: None,
+            datetime_format: None,
+            timestamp_format: None,
+            timestamp_tz_format: None,
+            time_format: None,
+        }
+    }
+}
+
+impl<'a> FormatOptions<'a> {
+    /// If set to `true` any formatting errors will be written to the output
+    /// instead of being converted into a [`std::fmt::Error`]
+    pub fn with_display_error(mut self, safe: bool) -> Self {
+        self.safe = safe;
+        self
+    }
+
+    /// Overrides the string used to represent a null
+    ///
+    /// Defaults to `""`
+    pub fn with_null(self, null: &'a str) -> Self {
+        Self { null, ..self }
+    }
+
+    /// Overrides the format used for [`DataType::Date32`] columns
+    pub fn with_date_format(self, date_format: Option<&'a str>) -> Self {
+        Self {
+            date_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Date64`] columns
+    pub fn with_datetime_format(self, datetime_format: Option<&'a str>) -> Self {
+        Self {
+            datetime_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Timestamp`] columns without a timezone
+    pub fn with_timestamp_format(self, timestamp_format: Option<&'a str>) -> Self {
+        Self {
+            timestamp_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Timestamp`] columns with a timezone
+    pub fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a str>) -> Self {
+        Self {
+            timestamp_tz_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Time32`] and [`DataType::Time64`] columns
+    pub fn with_time_format(self, time_format: Option<&'a str>) -> Self {
+        Self {
+            time_format,
+            ..self
+        }
+    }
+}
+
+/// Implements [`Display`] for a specific array value
+pub struct ValueFormatter<'a> {
+    idx: usize,
+    formatter: &'a ArrayFormatter<'a>,
+}
+
+impl<'a> ValueFormatter<'a> {
+    /// Writes this value to the provided [`Write`]
+    ///
+    /// Note: this ignores [`FormatOptions::with_display_error`]

Review Comment:
   ```suggestion
       /// Note: this ignores [`FormatOptions::with_display_error`] and
       /// will return an error on formatting issue
   ```



##########
arrow-cast/src/display.rs:
##########
@@ -19,57 +19,499 @@
 //! purposes. See the `pretty` crate for additional functions for
 //! record batch pretty printing.
 
-use std::fmt::Write;
-use std::sync::Arc;
+use std::fmt::{Debug, Display, Formatter, Write};
+use std::ops::Range;
 
+use arrow_array::cast::*;
+use arrow_array::temporal_conversions::*;
 use arrow_array::timezone::Tz;
 use arrow_array::types::*;
 use arrow_array::*;
 use arrow_buffer::ArrowNativeType;
 use arrow_schema::*;
-use chrono::prelude::SecondsFormat;
-use chrono::{DateTime, Utc};
+use chrono::{NaiveDate, NaiveDateTime, SecondsFormat, TimeZone, Utc};
+use lexical_core::FormattedSize;
 
-fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError {
-    ArrowError::CastError(format!(
-        "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}"
-    ))
+type TimeFormat<'a> = Option<&'a str>;
+
+/// Options for formatting arrays
+///
+/// By default nulls are formatted as `""` and temporal types formatted
+/// according to RFC3339
+///
+#[derive(Debug, Clone)]
+pub struct FormatOptions<'a> {
+    safe: bool,
+    /// Format string for nulls
+    null: &'a str,
+    /// Date format for date arrays
+    date_format: TimeFormat<'a>,
+    /// Format for DateTime arrays
+    datetime_format: TimeFormat<'a>,
+    /// Timestamp format for timestamp arrays
+    timestamp_format: TimeFormat<'a>,
+    /// Timestamp format for timestamp with timezone arrays
+    timestamp_tz_format: TimeFormat<'a>,
+    /// Time format for time arrays
+    time_format: TimeFormat<'a>,
+}
+
+impl<'a> Default for FormatOptions<'a> {
+    fn default() -> Self {
+        Self {
+            safe: true,
+            null: "",
+            date_format: None,
+            datetime_format: None,
+            timestamp_format: None,
+            timestamp_tz_format: None,
+            time_format: None,
+        }
+    }
+}
+
+impl<'a> FormatOptions<'a> {
+    /// If set to `true` any formatting errors will be written to the output

Review Comment:
   👍 



##########
arrow-cast/src/display.rs:
##########
@@ -19,57 +19,499 @@
 //! purposes. See the `pretty` crate for additional functions for
 //! record batch pretty printing.
 
-use std::fmt::Write;
-use std::sync::Arc;
+use std::fmt::{Debug, Display, Formatter, Write};
+use std::ops::Range;
 
+use arrow_array::cast::*;
+use arrow_array::temporal_conversions::*;
 use arrow_array::timezone::Tz;
 use arrow_array::types::*;
 use arrow_array::*;
 use arrow_buffer::ArrowNativeType;
 use arrow_schema::*;
-use chrono::prelude::SecondsFormat;
-use chrono::{DateTime, Utc};
+use chrono::{NaiveDate, NaiveDateTime, SecondsFormat, TimeZone, Utc};
+use lexical_core::FormattedSize;
 
-fn invalid_cast_error(dt: &str, col_idx: usize, row_idx: usize) -> ArrowError {
-    ArrowError::CastError(format!(
-        "Cannot cast to {dt} at col index: {col_idx} row index: {row_idx}"
-    ))
+type TimeFormat<'a> = Option<&'a str>;
+
+/// Options for formatting arrays
+///
+/// By default nulls are formatted as `""` and temporal types formatted
+/// according to RFC3339
+///
+#[derive(Debug, Clone)]
+pub struct FormatOptions<'a> {
+    safe: bool,
+    /// Format string for nulls
+    null: &'a str,
+    /// Date format for date arrays
+    date_format: TimeFormat<'a>,
+    /// Format for DateTime arrays
+    datetime_format: TimeFormat<'a>,
+    /// Timestamp format for timestamp arrays
+    timestamp_format: TimeFormat<'a>,
+    /// Timestamp format for timestamp with timezone arrays
+    timestamp_tz_format: TimeFormat<'a>,
+    /// Time format for time arrays
+    time_format: TimeFormat<'a>,
+}
+
+impl<'a> Default for FormatOptions<'a> {
+    fn default() -> Self {
+        Self {
+            safe: true,
+            null: "",
+            date_format: None,
+            datetime_format: None,
+            timestamp_format: None,
+            timestamp_tz_format: None,
+            time_format: None,
+        }
+    }
+}
+
+impl<'a> FormatOptions<'a> {
+    /// If set to `true` any formatting errors will be written to the output
+    /// instead of being converted into a [`std::fmt::Error`]
+    pub fn with_display_error(mut self, safe: bool) -> Self {
+        self.safe = safe;
+        self
+    }
+
+    /// Overrides the string used to represent a null
+    ///
+    /// Defaults to `""`
+    pub fn with_null(self, null: &'a str) -> Self {
+        Self { null, ..self }
+    }
+
+    /// Overrides the format used for [`DataType::Date32`] columns
+    pub fn with_date_format(self, date_format: Option<&'a str>) -> Self {
+        Self {
+            date_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Date64`] columns
+    pub fn with_datetime_format(self, datetime_format: Option<&'a str>) -> Self {
+        Self {
+            datetime_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Timestamp`] columns without a timezone
+    pub fn with_timestamp_format(self, timestamp_format: Option<&'a str>) -> Self {
+        Self {
+            timestamp_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Timestamp`] columns with a timezone
+    pub fn with_timestamp_tz_format(self, timestamp_tz_format: Option<&'a str>) -> Self {
+        Self {
+            timestamp_tz_format,
+            ..self
+        }
+    }
+
+    /// Overrides the format used for [`DataType::Time32`] and [`DataType::Time64`] columns
+    pub fn with_time_format(self, time_format: Option<&'a str>) -> Self {
+        Self {
+            time_format,
+            ..self
+        }
+    }
+}
+
+/// Implements [`Display`] for a specific array value
+pub struct ValueFormatter<'a> {

Review Comment:
   this is quite clever 👏 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org