You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2019/04/19 12:34:34 UTC
[arrow] branch master updated: ARROW-4805: [Rust] Write temporal
arrays to CSV
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9dd8668 ARROW-4805: [Rust] Write temporal arrays to CSV
9dd8668 is described below
commit 9dd8668eb9e241d722e0ddfa772860a859dc560b
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Fri Apr 19 14:34:15 2019 +0200
ARROW-4805: [Rust] Write temporal arrays to CSV
Author: Neville Dipale <ne...@gmail.com>
Closes #4170 from nevi-me/ARROW-4805 and squashes the following commits:
084928297 <Neville Dipale> fix csv test case
e8c4ff887 <Neville Dipale> ARROW-4805: Write temporal arrays to CSV
---
rust/arrow/src/csv/writer.rs | 177 +++++++++++++++++++++++++++++++++++++++++--
1 file changed, 169 insertions(+), 8 deletions(-)
diff --git a/rust/arrow/src/csv/writer.rs b/rust/arrow/src/csv/writer.rs
index 945fb71..f25a2f6 100644
--- a/rust/arrow/src/csv/writer.rs
+++ b/rust/arrow/src/csv/writer.rs
@@ -53,7 +53,8 @@
//! let batch = RecordBatch::try_new(
//! Arc::new(schema),
//! vec![Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4)],
-//! ).unwrap();
+//! )
+//! .unwrap();
//!
//! let file = get_temp_file("out.csv", &[]);
//!
@@ -70,6 +71,10 @@ use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::record_batch::RecordBatch;
+const DEFAULT_DATE_FORMAT: &str = "%F";
+const DEFAULT_TIME_FORMAT: &str = "%T";
+const DEFAULT_TIMESTAMP_FORMAT: &str = "%FT%H:%M:%S.%9f";
+
fn write_primitive_value<T>(array: &ArrayRef, i: usize) -> String
where
T: ArrowNumericType,
@@ -87,6 +92,12 @@ pub struct Writer {
delimiter: u8,
/// Whether file should be written with headers. Defaults to `true`
has_headers: bool,
+ /// The date format for date arrays
+ date_format: String,
+ /// The timestamp format for timestamp arrays
+ timestamp_format: String,
+ /// The time format for time arrays
+ time_format: String,
}
impl Writer {
@@ -96,6 +107,9 @@ impl Writer {
file,
delimiter: b',',
has_headers: true,
+ date_format: DEFAULT_DATE_FORMAT.to_string(),
+ time_format: DEFAULT_TIME_FORMAT.to_string(),
+ timestamp_format: DEFAULT_TIMESTAMP_FORMAT.to_string(),
}
}
@@ -169,6 +183,88 @@ impl Writer {
let c = col.as_any().downcast_ref::<BinaryArray>().unwrap();
String::from_utf8(c.value(row_index).to_vec())?
}
+ DataType::Date32(DateUnit::Day) => {
+ let c = col.as_any().downcast_ref::<Date32Array>().unwrap();
+ c.value_as_date(row_index)
+ .unwrap()
+ .format(&self.date_format)
+ .to_string()
+ }
+ DataType::Date64(DateUnit::Millisecond) => {
+ let c = col.as_any().downcast_ref::<Date64Array>().unwrap();
+ c.value_as_date(row_index)
+ .unwrap()
+ .format(&self.date_format)
+ .to_string()
+ }
+ DataType::Time32(TimeUnit::Second) => {
+ let c =
+ col.as_any().downcast_ref::<Time32SecondArray>().unwrap();
+ c.value_as_time(row_index)
+ .unwrap()
+ .format(&self.time_format)
+ .to_string()
+ }
+ DataType::Time32(TimeUnit::Millisecond) => {
+ let c = col
+ .as_any()
+ .downcast_ref::<Time32MillisecondArray>()
+ .unwrap();
+ c.value_as_time(row_index)
+ .unwrap()
+ .format(&self.time_format)
+ .to_string()
+ }
+ DataType::Time64(TimeUnit::Microsecond) => {
+ let c = col
+ .as_any()
+ .downcast_ref::<Time64MicrosecondArray>()
+ .unwrap();
+ c.value_as_time(row_index)
+ .unwrap()
+ .format(&self.time_format)
+ .to_string()
+ }
+ DataType::Time64(TimeUnit::Nanosecond) => {
+ let c = col
+ .as_any()
+ .downcast_ref::<Time64NanosecondArray>()
+ .unwrap();
+ c.value_as_time(row_index)
+ .unwrap()
+ .format(&self.time_format)
+ .to_string()
+ }
+ DataType::Timestamp(time_unit) => {
+ use TimeUnit::*;
+ let datetime = match time_unit {
+ Second => col
+ .as_any()
+ .downcast_ref::<TimestampSecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Millisecond => col
+ .as_any()
+ .downcast_ref::<TimestampMillisecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Microsecond => col
+ .as_any()
+ .downcast_ref::<TimestampMicrosecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ Nanosecond => col
+ .as_any()
+ .downcast_ref::<TimestampNanosecondArray>()
+ .unwrap()
+ .value_as_datetime(row_index)
+ .unwrap(),
+ };
+ format!("{}", datetime.format(&self.timestamp_format))
+ }
t => {
// List and Struct arrays not supported by the writer, any
// other type needs to be implemented
@@ -196,6 +292,12 @@ pub struct WriterBuilder {
delimiter: Option<u8>,
/// Whether to write column names as file headers. Defaults to `true`
has_headers: bool,
+ /// Optional date format for date arrays
+ date_format: Option<String>,
+ /// Optional timestamp format for timestamp arrays
+ timestamp_format: Option<String>,
+ /// Optional time format for time arrays
+ time_format: Option<String>,
}
impl Default for WriterBuilder {
@@ -203,6 +305,9 @@ impl Default for WriterBuilder {
Self {
has_headers: true,
delimiter: None,
+ date_format: Some(DEFAULT_DATE_FORMAT.to_string()),
+ time_format: Some(DEFAULT_TIME_FORMAT.to_string()),
+ timestamp_format: Some(DEFAULT_TIMESTAMP_FORMAT.to_string()),
}
}
}
@@ -246,12 +351,35 @@ impl WriterBuilder {
self
}
+ /// Set the CSV file's date format
+ pub fn with_date_format(mut self, format: String) -> Self {
+ self.date_format = Some(format);
+ self
+ }
+
+ /// Set the CSV file's time format
+ pub fn with_time_format(mut self, format: String) -> Self {
+ self.time_format = Some(format);
+ self
+ }
+
+ /// Set the CSV file's timestamp format
+ pub fn with_timestamp_format(mut self, format: String) -> Self {
+ self.timestamp_format = Some(format);
+ self
+ }
+
/// Create a new `Writer`
pub fn build(self, file: File) -> Writer {
Writer {
file,
delimiter: self.delimiter.unwrap_or(b','),
has_headers: self.has_headers,
+ date_format: self.date_format.unwrap_or(DEFAULT_DATE_FORMAT.to_string()),
+ time_format: self.time_format.unwrap_or(DEFAULT_TIME_FORMAT.to_string()),
+ timestamp_format: self
+ .timestamp_format
+ .unwrap_or(DEFAULT_TIMESTAMP_FORMAT.to_string()),
}
}
}
@@ -271,7 +399,9 @@ mod tests {
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Float64, true),
Field::new("c3", DataType::UInt32, false),
- Field::new("c3", DataType::Boolean, true),
+ Field::new("c4", DataType::Boolean, true),
+ Field::new("c5", DataType::Timestamp(TimeUnit::Millisecond), true),
+ Field::new("c6", DataType::Time32(TimeUnit::Second), false),
]);
let c1 = BinaryArray::from(vec![
@@ -286,10 +416,23 @@ mod tests {
]);
let c3 = PrimitiveArray::<UInt32Type>::from(vec![3, 2, 1]);
let c4 = PrimitiveArray::<BooleanType>::from(vec![Some(true), Some(false), None]);
+ let c5 = TimestampMillisecondArray::from(vec![
+ None,
+ Some(1555584887378),
+ Some(1555555555555),
+ ]);
+ let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]);
let batch = RecordBatch::try_new(
Arc::new(schema),
- vec![Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4)],
+ vec![
+ Arc::new(c1),
+ Arc::new(c2),
+ Arc::new(c3),
+ Arc::new(c4),
+ Arc::new(c5),
+ Arc::new(c6),
+ ],
)
.unwrap();
@@ -304,7 +447,14 @@ mod tests {
file.read_to_end(&mut buffer).unwrap();
assert_eq!(
- "c1,c2,c3,c3\nLorem ipsum dolor sit amet,123.564532,3,true\nconsectetur adipiscing elit,,2,false\nsed do eiusmod tempor,-556132.25,1,\nLorem ipsum dolor sit amet,123.564532,3,true\nconsectetur adipiscing elit,,2,false\nsed do eiusmod tempor,-556132.25,1,\n"
+ r#"c1,c2,c3,c4,c5,c6
+Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
+Lorem ipsum dolor sit amet,123.564532,3,true,,00:20:34
+consectetur adipiscing elit,,2,false,2019-04-18T10:54:47.378000000,06:51:20
+sed do eiusmod tempor,-556132.25,1,,2019-04-18T02:45:55.555000000,23:46:03
+"#
.to_string(),
String::from_utf8(buffer).unwrap()
);
@@ -316,7 +466,8 @@ mod tests {
Field::new("c1", DataType::Utf8, false),
Field::new("c2", DataType::Float64, true),
Field::new("c3", DataType::UInt32, false),
- Field::new("c3", DataType::Boolean, true),
+ Field::new("c4", DataType::Boolean, true),
+ Field::new("c6", DataType::Time32(TimeUnit::Second), false),
]);
let c1 = BinaryArray::from(vec![
@@ -331,16 +482,26 @@ mod tests {
]);
let c3 = PrimitiveArray::<UInt32Type>::from(vec![3, 2, 1]);
let c4 = PrimitiveArray::<BooleanType>::from(vec![Some(true), Some(false), None]);
+ let c6 = Time32SecondArray::from(vec![1234, 24680, 85563]);
let batch = RecordBatch::try_new(
Arc::new(schema),
- vec![Arc::new(c1), Arc::new(c2), Arc::new(c3), Arc::new(c4)],
+ vec![
+ Arc::new(c1),
+ Arc::new(c2),
+ Arc::new(c3),
+ Arc::new(c4),
+ Arc::new(c6),
+ ],
)
.unwrap();
let file = get_temp_file("custom_options.csv", &[]);
- let builder = WriterBuilder::new().has_headers(false).with_delimiter(b'|');
+ let builder = WriterBuilder::new()
+ .has_headers(false)
+ .with_delimiter(b'|')
+ .with_time_format("%r".to_string());
let writer = builder.build(file);
writer.write(vec![&batch]).unwrap();
@@ -351,7 +512,7 @@ mod tests {
file.read_to_end(&mut buffer).unwrap();
assert_eq!(
- "Lorem ipsum dolor sit amet|123.564532|3|true\nconsectetur adipiscing elit||2|false\nsed do eiusmod tempor|-556132.25|1|\n"
+ "Lorem ipsum dolor sit amet|123.564532|3|true|12:20:34 AM\nconsectetur adipiscing elit||2|false|06:51:20 AM\nsed do eiusmod tempor|-556132.25|1||11:46:03 PM\n"
.to_string(),
String::from_utf8(buffer).unwrap()
);