You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2021/02/06 11:41:03 UTC

[arrow] branch master updated: ARROW-11443: [Rust] Write datetime information for Date64 Type in csv writer

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new c18aba8  ARROW-11443: [Rust] Write datetime information for Date64 Type in csv writer
c18aba8 is described below

commit c18aba8158634a75e995ff513490892de35e8c06
Author: Ritchie Vink <ri...@gmail.com>
AuthorDate: Sat Feb 6 06:40:12 2021 -0500

    ARROW-11443: [Rust] Write datetime information for Date64 Type in csv writer
    
    The `Date64` datatype encodes date and time information in milliseconds. The current CSV writer only writes the date, making it impossible to serialize and deserialize while retaining the same precision.
    
    This PR proposes to write the date and time information to csv for `Date64` types.
    
    Closes #9379 from ritchie46/csv_writer_date64
    
    Authored-by: Ritchie Vink <ri...@gmail.com>
    Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
 rust/arrow/src/csv/writer.rs | 66 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 63 insertions(+), 3 deletions(-)

diff --git a/rust/arrow/src/csv/writer.rs b/rust/arrow/src/csv/writer.rs
index 7dbb727..e9d8565 100644
--- a/rust/arrow/src/csv/writer.rs
+++ b/rust/arrow/src/csv/writer.rs
@@ -97,6 +97,8 @@ pub struct Writer<W: Write> {
     has_headers: bool,
     /// The date format for date arrays
     date_format: String,
+    /// The datetime format for datetime arrays
+    datetime_format: String,
     /// The timestamp format for timestamp arrays
     timestamp_format: String,
     /// The time format for time arrays
@@ -116,6 +118,7 @@ impl<W: Write> Writer<W> {
             delimiter,
             has_headers: true,
             date_format: DEFAULT_DATE_FORMAT.to_string(),
+            datetime_format: DEFAULT_TIMESTAMP_FORMAT.to_string(),
             time_format: DEFAULT_TIME_FORMAT.to_string(),
             timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(),
             beginning: true,
@@ -169,9 +172,9 @@ impl<W: Write> Writer<W> {
                 }
                 DataType::Date64 => {
                     let c = col.as_any().downcast_ref::<Date64Array>().unwrap();
-                    c.value_as_date(row_index)
+                    c.value_as_datetime(row_index)
                         .unwrap()
-                        .format(&self.date_format)
+                        .format(&self.datetime_format)
                         .to_string()
                 }
                 DataType::Time32(TimeUnit::Second) => {
@@ -292,6 +295,8 @@ pub struct WriterBuilder {
     has_headers: bool,
     /// Optional date format for date arrays
     date_format: Option<String>,
+    /// Optional datetime format for datetime arrays
+    datetime_format: Option<String>,
     /// Optional timestamp format for timestamp arrays
     timestamp_format: Option<String>,
     /// Optional time format for time arrays
@@ -304,6 +309,7 @@ impl Default for WriterBuilder {
             has_headers: true,
             delimiter: None,
             date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
+            datetime_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
             time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
             timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
         }
@@ -379,6 +385,9 @@ impl WriterBuilder {
             date_format: self
                 .date_format
                 .unwrap_or_else(|| DEFAULT_DATE_FORMAT.to_string()),
+            datetime_format: self
+                .datetime_format
+                .unwrap_or_else(|| DEFAULT_TIMESTAMP_FORMAT.to_string()),
             time_format: self
                 .time_format
                 .unwrap_or_else(|| DEFAULT_TIME_FORMAT.to_string()),
@@ -394,11 +403,12 @@ impl WriterBuilder {
 mod tests {
     use super::*;
 
+    use crate::csv::Reader;
     use crate::datatypes::{Field, Schema};
     use crate::util::string_writer::StringWriter;
     use crate::util::test_util::get_temp_file;
     use std::fs::File;
-    use std::io::Read;
+    use std::io::{Cursor, Read};
     use std::sync::Arc;
 
     #[test]
@@ -588,4 +598,54 @@ sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03\n";
         let right = writer.writer.into_inner().map(|s| s.to_string());
         assert_eq!(Some(left.to_string()), right.ok());
     }
+
+    #[test]
+    fn test_conversion_consistency() {
+        // Round-trip Date32/Date64 columns through CSV and check that the values come back unchanged.
+
+        let schema = Schema::new(vec![
+            Field::new("c1", DataType::Date32, false),
+            Field::new("c2", DataType::Date64, false),
+        ]);
+
+        let c1 = Date32Array::from(vec![3, 2, 1]);
+        let c2 = Date64Array::from(vec![3, 2, 1]);
+
+        let batch = RecordBatch::try_new(
+            Arc::new(schema.clone()),
+            vec![Arc::new(c1), Arc::new(c2)],
+        )
+        .unwrap();
+
+        let builder = WriterBuilder::new().has_headers(false);
+
+        let mut buf: Cursor<Vec<u8>> = Default::default();
+        // Scope the writer so its mutable borrow of `buf` ends before the buffer is rewound and read.
+        {
+            let mut writer = builder.build(&mut buf);
+            writer.write(&batch).unwrap();
+        }
+        buf.set_position(0);
+
+        let mut reader = Reader::new(
+            buf,
+            Arc::new(schema),
+            false,
+            None,
+            3,
+            // NOTE(review): both trailing options unset - presumably row bounds and projection; confirm.
+            None,
+            None,
+        );
+        let rb = reader.next().unwrap().unwrap();
+        let c1 = rb.column(0).as_any().downcast_ref::<Date32Array>().unwrap();
+        let c2 = rb.column(1).as_any().downcast_ref::<Date64Array>().unwrap();
+
+        let actual = c1.into_iter().collect::<Vec<_>>();
+        let expected = vec![Some(3), Some(2), Some(1)];
+        assert_eq!(actual, expected);
+        let actual = c2.into_iter().collect::<Vec<_>>();
+        let expected = vec![Some(3), Some(2), Some(1)];
+        assert_eq!(actual, expected);
+    }
 }