You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by ks...@apache.org on 2018/12/19 12:57:46 UTC

[arrow] branch master updated: ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader

This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new b8d4477  ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader
b8d4477 is described below

commit b8d4477ffbe5a569521828964277e7d6ea115671
Author: Neville Dipale <ne...@gmail.com>
AuthorDate: Wed Dec 19 13:57:13 2018 +0100

    ARROW-3989: [Rust] [CSV] Cast bool string to lower case in reader
    
    The CSV reader currently parses boolean values only when the string is exactly lowercase `true` or `false`. Excel saves booleans as `TRUE|FALSE`, and Python/Pandas writes them as `True|False`.
    
    This PR lowercases the string before parsing it when the column's data type is Boolean, so all of these spellings are accepted.
    
    @andygrove @sunchao I believe it's ready for review.
    
    Author: Neville Dipale <ne...@gmail.com>
    
    Closes #3214 from nevi-me/rust/boolean-case and squashes the following commits:
    
    38d99426 <Neville Dipale> move primitive array builder into Reader
    9fae4428 <Neville Dipale> move is_boolean_type check out of loop, remove duplicate impl Reader
    2a86b527 <Neville Dipale> : Cast timestamp string to lower case to handle True, TRUE ...
---
 rust/src/csv/reader.rs       | 79 ++++++++++++++++++++++++--------------------
 rust/test/data/null_test.csv |  8 ++---
 2 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs
index 632aa7a..b9c46fc 100644
--- a/rust/src/csv/reader.rs
+++ b/rust/src/csv/reader.rs
@@ -87,32 +87,7 @@ impl Reader {
             batch_size,
         }
     }
-}
 
-fn build_primitive_array<T: ArrowPrimitiveType>(
-    rows: &[StringRecord],
-    col_idx: &usize,
-) -> Result<ArrayRef> {
-    let mut builder = PrimitiveArrayBuilder::<T>::new(rows.len());
-    for row_index in 0..rows.len() {
-        match rows[row_index].get(*col_idx) {
-            Some(s) if s.len() > 0 => match s.parse::<T::Native>() {
-                Ok(v) => builder.push(v)?,
-                Err(_) => {
-                    // TODO: we should surface the underlying error here.
-                    return Err(ArrowError::ParseError(format!(
-                        "Error while parsing value {}",
-                        s
-                    )));
-                }
-            },
-            _ => builder.push_null().unwrap(),
-        }
-    }
-    Ok(Arc::new(builder.finish()) as ArrayRef)
-}
-
-impl Reader {
     /// Read the next batch of rows
     pub fn next(&mut self) -> Result<Option<RecordBatch>> {
         // read a batch of rows into memory
@@ -151,17 +126,17 @@ impl Reader {
             .map(|i| {
                 let field = self.schema.field(*i);
                 match field.data_type() {
-                    &DataType::Boolean => build_primitive_array::<BooleanType>(rows, i),
-                    &DataType::Int8 => build_primitive_array::<Int8Type>(rows, i),
-                    &DataType::Int16 => build_primitive_array::<Int16Type>(rows, i),
-                    &DataType::Int32 => build_primitive_array::<Int32Type>(rows, i),
-                    &DataType::Int64 => build_primitive_array::<Int64Type>(rows, i),
-                    &DataType::UInt8 => build_primitive_array::<UInt8Type>(rows, i),
-                    &DataType::UInt16 => build_primitive_array::<UInt16Type>(rows, i),
-                    &DataType::UInt32 => build_primitive_array::<UInt32Type>(rows, i),
-                    &DataType::UInt64 => build_primitive_array::<UInt64Type>(rows, i),
-                    &DataType::Float32 => build_primitive_array::<Float32Type>(rows, i),
-                    &DataType::Float64 => build_primitive_array::<Float64Type>(rows, i),
+                    &DataType::Boolean => self.build_primitive_array::<BooleanType>(rows, i),
+                    &DataType::Int8 => self.build_primitive_array::<Int8Type>(rows, i),
+                    &DataType::Int16 => self.build_primitive_array::<Int16Type>(rows, i),
+                    &DataType::Int32 => self.build_primitive_array::<Int32Type>(rows, i),
+                    &DataType::Int64 => self.build_primitive_array::<Int64Type>(rows, i),
+                    &DataType::UInt8 => self.build_primitive_array::<UInt8Type>(rows, i),
+                    &DataType::UInt16 => self.build_primitive_array::<UInt16Type>(rows, i),
+                    &DataType::UInt32 => self.build_primitive_array::<UInt32Type>(rows, i),
+                    &DataType::UInt64 => self.build_primitive_array::<UInt64Type>(rows, i),
+                    &DataType::Float32 => self.build_primitive_array::<Float32Type>(rows, i),
+                    &DataType::Float64 => self.build_primitive_array::<Float64Type>(rows, i),
                     &DataType::Utf8 => {
                         let values_builder: UInt8Builder = UInt8Builder::new(rows.len());
                         let mut list_builder = ListArrayBuilder::new(values_builder);
@@ -191,6 +166,38 @@ impl Reader {
             Err(e) => Err(e),
         }
     }
+
+    fn build_primitive_array<T: ArrowPrimitiveType>(
+        &self,
+        rows: &[StringRecord],
+        col_idx: &usize,
+    ) -> Result<ArrayRef> {
+        let mut builder = PrimitiveArrayBuilder::<T>::new(rows.len());
+        let is_boolean_type = *self.schema.field(*col_idx).data_type() == DataType::Boolean;
+        for row_index in 0..rows.len() {
+            match rows[row_index].get(*col_idx) {
+                Some(s) if s.len() > 0 => {
+                    let t = if is_boolean_type {
+                        s.to_lowercase().parse::<T::Native>()
+                    } else {
+                        s.parse::<T::Native>()
+                    };
+                    match t {
+                        Ok(v) => builder.push(v)?,
+                        Err(_) => {
+                            // TODO: we should surface the underlying error here.
+                            return Err(ArrowError::ParseError(format!(
+                                "Error while parsing value {}",
+                                s
+                            )));
+                        }
+                    }
+                }
+                _ => builder.push_null()?,
+            }
+        }
+        Ok(Arc::new(builder.finish()) as ArrayRef)
+    }
 }
 
 #[cfg(test)]
diff --git a/rust/test/data/null_test.csv b/rust/test/data/null_test.csv
index 8083060..7e0dde5 100644
--- a/rust/test/data/null_test.csv
+++ b/rust/test/data/null_test.csv
@@ -1,6 +1,6 @@
 c_int,c_float,c_string,c_bool
-1,1.1,"1.11",true
-2,2.2,"2.22",true
+1,1.1,"1.11",True
+2,2.2,"2.22",TRUE
 3,,"3.33",true
-4,4.4,,false
-5,6.6,"",false
\ No newline at end of file
+4,4.4,,False
+5,6.6,"",FALSE
\ No newline at end of file