You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/19 12:57:46 UTC
[arrow] branch master updated: ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader
This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new b8d4477 ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader
b8d4477 is described below
commit b8d4477ffbe5a569521828964277e7d6ea115671
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Wed Dec 19 13:57:13 2018 +0100
ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader
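The CSV reader currently parses a boolean column only when the value is spelled exactly `true` or `false`. Excel saves bools as `TRUE|FALSE`, and Python/Pandas as `True|False`, so values written by those tools fail with a parse error.
This PR adds a condition that lowercases the string for boolean columns before parsing it into the Arrow type; all other column types are parsed unchanged.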
@andygrove @sunchao I believe it's ready for review.
Author: Neville Dipale <ne...@gmail.com>
Closes #3214 from nevi-me/rust/boolean-case and squashes the following commits:
38d99426 <Neville Dipale> move primitive array builder into Reader
9fae4428 <Neville Dipale> move is_boolean_type check out of loop, remove duplicate impl Reader
2a86b527 <Neville Dipale> : Cast timestamp string to lower case to handle True, TRUE ...
---
rust/src/csv/reader.rs | 79 ++++++++++++++++++++++++--------------------
rust/test/data/null_test.csv | 8 ++---
2 files changed, 47 insertions(+), 40 deletions(-)
diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs
index 632aa7a..b9c46fc 100644
--- a/rust/src/csv/reader.rs
+++ b/rust/src/csv/reader.rs
@@ -87,32 +87,7 @@ impl Reader {
batch_size,
}
}
-}
-fn build_primitive_array<T: ArrowPrimitiveType>(
- rows: &[StringRecord],
- col_idx: &usize,
-) -> Result<ArrayRef> {
- let mut builder = PrimitiveArrayBuilder::<T>::new(rows.len());
- for row_index in 0..rows.len() {
- match rows[row_index].get(*col_idx) {
- Some(s) if s.len() > 0 => match s.parse::<T::Native>() {
- Ok(v) => builder.push(v)?,
- Err(_) => {
- // TODO: we should surface the underlying error here.
- return Err(ArrowError::ParseError(format!(
- "Error while parsing value {}",
- s
- )));
- }
- },
- _ => builder.push_null().unwrap(),
- }
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
-}
-
-impl Reader {
/// Read the next batch of rows
pub fn next(&mut self) -> Result<Option<RecordBatch>> {
// read a batch of rows into memory
@@ -151,17 +126,17 @@ impl Reader {
.map(|i| {
let field = self.schema.field(*i);
match field.data_type() {
- &DataType::Boolean => build_primitive_array::<BooleanType>(rows, i),
- &DataType::Int8 => build_primitive_array::<Int8Type>(rows, i),
- &DataType::Int16 => build_primitive_array::<Int16Type>(rows, i),
- &DataType::Int32 => build_primitive_array::<Int32Type>(rows, i),
- &DataType::Int64 => build_primitive_array::<Int64Type>(rows, i),
- &DataType::UInt8 => build_primitive_array::<UInt8Type>(rows, i),
- &DataType::UInt16 => build_primitive_array::<UInt16Type>(rows, i),
- &DataType::UInt32 => build_primitive_array::<UInt32Type>(rows, i),
- &DataType::UInt64 => build_primitive_array::<UInt64Type>(rows, i),
- &DataType::Float32 => build_primitive_array::<Float32Type>(rows, i),
- &DataType::Float64 => build_primitive_array::<Float64Type>(rows, i),
+ &DataType::Boolean => self.build_primitive_array::<BooleanType>(rows, i),
+ &DataType::Int8 => self.build_primitive_array::<Int8Type>(rows, i),
+ &DataType::Int16 => self.build_primitive_array::<Int16Type>(rows, i),
+ &DataType::Int32 => self.build_primitive_array::<Int32Type>(rows, i),
+ &DataType::Int64 => self.build_primitive_array::<Int64Type>(rows, i),
+ &DataType::UInt8 => self.build_primitive_array::<UInt8Type>(rows, i),
+ &DataType::UInt16 => self.build_primitive_array::<UInt16Type>(rows, i),
+ &DataType::UInt32 => self.build_primitive_array::<UInt32Type>(rows, i),
+ &DataType::UInt64 => self.build_primitive_array::<UInt64Type>(rows, i),
+ &DataType::Float32 => self.build_primitive_array::<Float32Type>(rows, i),
+ &DataType::Float64 => self.build_primitive_array::<Float64Type>(rows, i),
&DataType::Utf8 => {
let values_builder: UInt8Builder = UInt8Builder::new(rows.len());
let mut list_builder = ListArrayBuilder::new(values_builder);
@@ -191,6 +166,38 @@ impl Reader {
Err(e) => Err(e),
}
}
+
+ fn build_primitive_array<T: ArrowPrimitiveType>(
+ &self,
+ rows: &[StringRecord],
+ col_idx: &usize,
+ ) -> Result<ArrayRef> {
+ let mut builder = PrimitiveArrayBuilder::<T>::new(rows.len());
+ let is_boolean_type = *self.schema.field(*col_idx).data_type() == DataType::Boolean;
+ for row_index in 0..rows.len() {
+ match rows[row_index].get(*col_idx) {
+ Some(s) if s.len() > 0 => {
+ let t = if is_boolean_type {
+ s.to_lowercase().parse::<T::Native>()
+ } else {
+ s.parse::<T::Native>()
+ };
+ match t {
+ Ok(v) => builder.push(v)?,
+ Err(_) => {
+ // TODO: we should surface the underlying error here.
+ return Err(ArrowError::ParseError(format!(
+ "Error while parsing value {}",
+ s
+ )));
+ }
+ }
+ }
+ _ => builder.push_null()?,
+ }
+ }
+ Ok(Arc::new(builder.finish()) as ArrayRef)
+ }
}
#[cfg(test)]
diff --git a/rust/test/data/null_test.csv b/rust/test/data/null_test.csv
index 8083060..7e0dde5 100644
--- a/rust/test/data/null_test.csv
+++ b/rust/test/data/null_test.csv
@@ -1,6 +1,6 @@
c_int,c_float,c_string,c_bool
-1,1.1,"1.11",true
-2,2.2,"2.22",true
+1,1.1,"1.11",True
+2,2.2,"2.22",TRUE
3,,"3.33",true
-4,4.4,,false
-5,6.6,"",false
\ No newline at end of file
+4,4.4,,False
+5,6.6,"",FALSE
\ No newline at end of file