You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2020/11/21 23:10:07 UTC

[arrow] branch master updated: ARROW-10654: [Rust] Specialize parsing of floats / bools in CSV Reader

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new d873657  ARROW-10654: [Rust] Specialize parsing of floats / bools in CSV Reader
d873657 is described below

commit d8736576f5681c49c0af5a8c37edf53ec90ccfb1
Author: Heres, Daniel <da...@gmail.com>
AuthorDate: Sat Nov 21 18:09:32 2020 -0500

    ARROW-10654: [Rust] Specialize parsing of floats / bools in CSV Reader
    
    Internal rust float parser is known to be slow.
    
    This change allows to have specialized implementations rather than relying on FromStr::parse.
    
    Also avoids calling `to_lowercase` for booleans.
    
    Would be nice to benchmark this.
    
    Closes #8714 from Dandandan/specialize_csv_parser
    
    Authored-by: Heres, Daniel <da...@gmail.com>
    Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
 rust/arrow/Cargo.toml        |  1 +
 rust/arrow/src/csv/reader.rs | 64 +++++++++++++++++++++++++++++++++++++-------
 2 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index ff53dc9..0e35fcc 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -50,6 +50,7 @@ chrono = "0.4"
 flatbuffers = "0.6"
 hex = "0.4"
 prettytable-rs = { version = "0.8.0", optional = true }
+lexical-core = "^0.7"
 
 [features]
 default = []
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index 9ed2d1f..fc06922 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -446,8 +446,57 @@ fn parse(
     arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr))
 }
 
+trait Parser: ArrowPrimitiveType {
+    fn parse(string: &str) -> Option<Self::Native> {
+        string.parse::<Self::Native>().ok()
+    }
+}
+
+impl Parser for BooleanType {
+    /// Parses exactly `true`/`TRUE`/`True` or `false`/`FALSE`/`False`; any other
+    /// spelling yields `None`. Literal comparison avoids a `to_lowercase` allocation.
+    fn parse(string: &str) -> Option<bool> {
+        match string {
+            "true" | "TRUE" | "True" => Some(true),
+            "false" | "FALSE" | "False" => Some(false),
+            _ => None,
+        }
+    }
+}
+
+impl Parser for Float32Type {
+    fn parse(string: &str) -> Option<f32> {
+        lexical_core::parse(string.as_bytes()).ok()
+    }
+}
+impl Parser for Float64Type {
+    fn parse(string: &str) -> Option<f64> {
+        lexical_core::parse(string.as_bytes()).ok()
+    }
+}
+
+impl Parser for UInt64Type {}
+
+impl Parser for UInt32Type {}
+
+impl Parser for UInt16Type {}
+
+impl Parser for UInt8Type {}
+
+impl Parser for Int64Type {}
+
+impl Parser for Int32Type {}
+
+impl Parser for Int16Type {}
+
+impl Parser for Int8Type {}
+
+fn parse_item<T: Parser>(string: &str) -> Option<T::Native> {
+    T::parse(string)
+}
+
 // parses a specific column (col_idx) into an Arrow Array.
-fn build_primitive_array<T: ArrowPrimitiveType>(
+fn build_primitive_array<T: ArrowPrimitiveType + Parser>(
     line_number: usize,
     rows: &[StringRecord],
     col_idx: usize,
@@ -460,14 +509,11 @@ fn build_primitive_array<T: ArrowPrimitiveType>(
                     if s.is_empty() {
                         return Ok(None);
                     }
-                    let parsed = if T::DATA_TYPE == DataType::Boolean {
-                        s.to_lowercase().parse::<T::Native>()
-                    } else {
-                        s.parse::<T::Native>()
-                    };
+
+                    let parsed = parse_item::<T>(s);
                     match parsed {
-                        Ok(e) => Ok(Some(e)),
-                        Err(_) => Err(ArrowError::ParseError(format!(
+                        Some(e) => Ok(Some(e)),
+                        None => Err(ArrowError::ParseError(format!(
                             // TODO: we should surface the underlying error here.
                             "Error while parsing value {} for column {} at line {}",
                             s,
@@ -888,7 +934,7 @@ mod tests {
                     format!("{:?}", e)
                 ),
                 Ok(_) => panic!("should have failed"),
-            }
+            },
             None => panic!("should have failed"),
         }
     }