You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "tustvold (via GitHub)" <gi...@apache.org> on 2023/03/06 09:00:33 UTC
[GitHub] [arrow-rs] tustvold commented on a diff in pull request #3805: Support reading decimal arrays from json

tustvold commented on code in PR #3805:
URL: https://github.com/apache/arrow-rs/pull/3805#discussion_r1126099715


##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: i8) -> String {
     }
 }
 
+lazy_static! {
+    static ref PARSE_DECIMAL_RE: Regex =
+        Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap();
+}
+
+/// Parse the string format decimal value to i128/i256 format and checking the precision and scale.
+/// The result value can't be out of bounds.
+pub fn parse_decimal_with_parameter<T: DecimalType>(

Review Comment:
   Could we put this in arrow-cast/parse instead?



##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: i8) -> String {
     }
 }
 
+lazy_static! {
+    static ref PARSE_DECIMAL_RE: Regex =

Review Comment:
   This seems like quite a heavy way to achieve this. I wonder if we could avoid bringing in this dependency by using a simple for loop over the string bytes?



##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: i8) -> String {
     }
 }
 
+lazy_static! {
+    static ref PARSE_DECIMAL_RE: Regex =
+        Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap();
+}
+
+/// Parse the string format decimal value to i128/i256 format and checking the precision and scale.
+/// The result value can't be out of bounds.
+pub fn parse_decimal_with_parameter<T: DecimalType>(
+    s: &str,
+    precision: u8,
+    scale: i8,
+) -> Result<T::Native, ArrowError> {
+    if PARSE_DECIMAL_RE.is_match(s) {
+        let mut offset = s.len();
+        let len = s.len();
+        let mut base = T::Native::usize_as(1);
+        let scale_usize = usize::from(scale as u8);
+
+        // handle the value after the '.' and meet the scale
+        let delimiter_position = s.find('.');
+        match delimiter_position {
+            None => {
+                // there is no '.'
+                base = T::Native::usize_as(10).pow_checked(scale as u32)?;
+            }
+            Some(mid) => {
+                // there is the '.'
+                if len - mid >= scale_usize + 1 {
+                    // If the string value is "123.12345" and the scale is 2, we should just remain '.12' and drop the '345' value.
+                    offset -= len - mid - 1 - scale_usize;
+                } else {
+                    // If the string value is "123.12" and the scale is 4, we should append '00' to the tail.
+                    base = T::Native::usize_as(10)
+                        .pow_checked((scale_usize + 1 + mid - len) as u32)?;
+                }
+            }
+        };
+
+        // each byte is digit、'-' or '.'
+        let bytes = s.as_bytes();
+        let mut negative = false;
+        let mut result = T::Native::usize_as(0);
+
+        bytes[0..offset]
+            .iter()
+            .rev()
+            .try_for_each::<_, Result<(), ArrowError>>(|&byte| {
+                match byte {
+                    b'-' => {
+                        negative = true;
+                    }
+                    b'0'..=b'9' => {
+                        let add = T::Native::usize_as((byte - b'0') as usize)
+                            .mul_checked(base)?;
+                        result = result.add_checked(add)?;
+                        base = base.mul_checked(T::Native::usize_as(10))?;
+                    }
+                    // because of the PARSE_DECIMAL_RE, bytes just contains digit、'-' and '.'.
+                    _ => (),
+                }
+                Ok(())
+            })?;
+
+        if negative {
+            result = result.neg_checked()?;
+        }
+
+        match T::validate_decimal_precision(result, precision) {
+            Ok(_) => Ok(result),
+            Err(e) => Err(ArrowError::ParseError(format!(
+                "parse decimal overflow: {e}"
+            ))),
+        }
+    } else {
+        Err(ArrowError::ParseError(format!(
+            "can't parse the string value {s} to decimal"
+        )))
+    }
+}
+
+// Parse the string format decimal value to i128 format without checking the precision and scale.
+// Like "125.12" to 12512_i128.
+#[cfg(test)]

Review Comment:
   Could we drop this and rename the above method to parse_decimal?



##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize, scale: i8) -> String {
     }
 }
 
+lazy_static! {
+    static ref PARSE_DECIMAL_RE: Regex =
+        Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap();
+}
+
+/// Parse the string format decimal value to i128/i256 format and checking the precision and scale.
+/// The result value can't be out of bounds.
+pub fn parse_decimal_with_parameter<T: DecimalType>(
+    s: &str,
+    precision: u8,
+    scale: i8,
+) -> Result<T::Native, ArrowError> {
+    if PARSE_DECIMAL_RE.is_match(s) {
+        let mut offset = s.len();
+        let len = s.len();
+        let mut base = T::Native::usize_as(1);
+        let scale_usize = usize::from(scale as u8);
+
+        // handle the value after the '.' and meet the scale
+        let delimiter_position = s.find('.');
+        match delimiter_position {
+            None => {
+                // there is no '.'
+                base = T::Native::usize_as(10).pow_checked(scale as u32)?;
+            }
+            Some(mid) => {
+                // there is the '.'
+                if len - mid >= scale_usize + 1 {
+                    // If the string value is "123.12345" and the scale is 2, we should just remain '.12' and drop the '345' value.
+                    offset -= len - mid - 1 - scale_usize;
+                } else {
+                    // If the string value is "123.12" and the scale is 4, we should append '00' to the tail.
+                    base = T::Native::usize_as(10)
+                        .pow_checked((scale_usize + 1 + mid - len) as u32)?;
+                }
+            }
+        };
+
+        // each byte is digit、'-' or '.'
+        let bytes = s.as_bytes();
+        let mut negative = false;
+        let mut result = T::Native::usize_as(0);
+
+        bytes[0..offset]
+            .iter()
+            .rev()
+            .try_for_each::<_, Result<(), ArrowError>>(|&byte| {
+                match byte {
+                    b'-' => {
+                        negative = true;
+                    }
+                    b'0'..=b'9' => {
+                        let add = T::Native::usize_as((byte - b'0') as usize)
+                            .mul_checked(base)?;
+                        result = result.add_checked(add)?;
+                        base = base.mul_checked(T::Native::usize_as(10))?;
+                    }
+                    // because of the PARSE_DECIMAL_RE, bytes just contains digit、'-' and '.'.
+                    _ => (),
+                }
+                Ok(())
+            })?;
+
+        if negative {
+            result = result.neg_checked()?;
+        }
+
+        match T::validate_decimal_precision(result, precision) {
+            Ok(_) => Ok(result),
+            Err(e) => Err(ArrowError::ParseError(format!(
+                "parse decimal overflow: {e}"
+            ))),
+        }
+    } else {
+        Err(ArrowError::ParseError(format!(
+            "can't parse the string value {s} to decimal"
+        )))
+    }
+}
+
+// Parse the string format decimal value to i128 format without checking the precision and scale.
+// Like "125.12" to 12512_i128.
+#[cfg(test)]
+fn parse_decimal(s: &str) -> Result<i128, ArrowError> {
+    use std::ops::Neg;
+
+    if PARSE_DECIMAL_RE.is_match(s) {
+        let mut offset = s.len();
+        // each byte is digit、'-' or '.'
+        let bytes = s.as_bytes();
+        let mut negative = false;
+        let mut result: i128 = 0;
+        let mut base = 1;
+        while offset > 0 {
+            match bytes[offset - 1] {
+                b'-' => {
+                    negative = true;
+                }
+                b'.' => {
+                    // do nothing

Review Comment:
   It occurs to me that if this checked that the decimal point only appears once, we could drop the regex.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org