You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by al...@apache.org on 2020/11/21 23:10:07 UTC
[arrow] branch master updated: ARROW-10654: [Rust] Specialize
parsing of floats / bools in CSV Reader
This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new d873657 ARROW-10654: [Rust] Specialize parsing of floats / bools in CSV Reader
d873657 is described below
commit d8736576f5681c49c0af5a8c37edf53ec90ccfb1
Author: Heres, Daniel <da...@gmail.com>
AuthorDate: Sat Nov 21 18:09:32 2020 -0500
ARROW-10654: [Rust] Specialize parsing of floats / bools in CSV Reader
Rust's built-in float parser is known to be slow.
This change allows specialized parser implementations per type rather than always relying on FromStr::parse.
Also avoids calling `to_lowercase` for booleans.
Would be nice to benchmark this.
Closes #8714 from Dandandan/specialize_csv_parser
Authored-by: Heres, Daniel <da...@gmail.com>
Signed-off-by: Andrew Lamb <an...@nerdnetworks.org>
---
rust/arrow/Cargo.toml | 1 +
rust/arrow/src/csv/reader.rs | 64 +++++++++++++++++++++++++++++++++++++-------
2 files changed, 56 insertions(+), 9 deletions(-)
diff --git a/rust/arrow/Cargo.toml b/rust/arrow/Cargo.toml
index ff53dc9..0e35fcc 100644
--- a/rust/arrow/Cargo.toml
+++ b/rust/arrow/Cargo.toml
@@ -50,6 +50,7 @@ chrono = "0.4"
flatbuffers = "0.6"
hex = "0.4"
prettytable-rs = { version = "0.8.0", optional = true }
+lexical-core = "^0.7"
[features]
default = []
diff --git a/rust/arrow/src/csv/reader.rs b/rust/arrow/src/csv/reader.rs
index 9ed2d1f..fc06922 100644
--- a/rust/arrow/src/csv/reader.rs
+++ b/rust/arrow/src/csv/reader.rs
@@ -446,8 +446,57 @@ fn parse(
arrays.and_then(|arr| RecordBatch::try_new(projected_schema, arr))
}
+/// Converts a single CSV field into the native value of an Arrow primitive type.
+///
+/// The provided `parse` method delegates to the native type's `str::parse`
+/// conversion; an implementation can override it with a specialized parser.
+trait Parser: ArrowPrimitiveType {
+    /// Returns the parsed value, or `None` when `string` cannot be converted.
+    fn parse(string: &str) -> Option<Self::Native> {
+        str::parse::<Self::Native>(string).ok()
+    }
+}
+
+impl Parser for BooleanType {
+    /// Parses a CSV boolean by direct string comparison, without allocating a
+    /// lowercased copy of the field.
+    ///
+    /// Accepts exactly the spellings `true`/`TRUE`/`True` (yielding
+    /// `Some(true)`) and `false`/`FALSE`/`False` (yielding `Some(false)`).
+    /// Any other input returns `None`.
+    fn parse(string: &str) -> Option<bool> {
+        match string {
+            "true" | "TRUE" | "True" => Some(true),
+            "false" | "FALSE" | "False" => Some(false),
+            _ => None,
+        }
+    }
+}
+
+impl Parser for Float32Type {
+    /// Parses an `f32` from the field's bytes using `lexical_core` instead of
+    /// the trait's default `str::parse` path; returns `None` on failure.
+    fn parse(string: &str) -> Option<f32> {
+        let bytes = string.as_bytes();
+        lexical_core::parse::<f32>(bytes).ok()
+    }
+}
+impl Parser for Float64Type {
+    /// Parses an `f64` from the field's bytes using `lexical_core` instead of
+    /// the trait's default `str::parse` path; returns `None` on failure.
+    fn parse(string: &str) -> Option<f64> {
+        let bytes = string.as_bytes();
+        lexical_core::parse::<f64>(bytes).ok()
+    }
+}
+
+// The types below provide no override, so they use the `parse` method
+// supplied by the `Parser` trait itself.
+impl Parser for UInt64Type {}
+
+impl Parser for UInt32Type {}
+
+impl Parser for UInt16Type {}
+
+impl Parser for UInt8Type {}
+
+impl Parser for Int64Type {}
+
+impl Parser for Int32Type {}
+
+impl Parser for Int16Type {}
+
+impl Parser for Int8Type {}
+
+/// Parses `string` using the `Parser` implementation selected by `T`,
+/// returning `None` when the value cannot be converted.
+fn parse_item<T: Parser>(string: &str) -> Option<T::Native> {
+    <T as Parser>::parse(string)
+}
+
// parses a specific column (col_idx) into an Arrow Array.
-fn build_primitive_array<T: ArrowPrimitiveType>(
+fn build_primitive_array<T: ArrowPrimitiveType + Parser>(
line_number: usize,
rows: &[StringRecord],
col_idx: usize,
@@ -460,14 +509,11 @@ fn build_primitive_array<T: ArrowPrimitiveType>(
if s.is_empty() {
return Ok(None);
}
- let parsed = if T::DATA_TYPE == DataType::Boolean {
- s.to_lowercase().parse::<T::Native>()
- } else {
- s.parse::<T::Native>()
- };
+
+ let parsed = parse_item::<T>(s);
match parsed {
- Ok(e) => Ok(Some(e)),
- Err(_) => Err(ArrowError::ParseError(format!(
+ Some(e) => Ok(Some(e)),
+ None => Err(ArrowError::ParseError(format!(
// TODO: we should surface the underlying error here.
"Error while parsing value {} for column {} at line {}",
s,
@@ -888,7 +934,7 @@ mod tests {
format!("{:?}", e)
),
Ok(_) => panic!("should have failed"),
- }
+ },
None => panic!("should have failed"),
}
}