You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/04/11 13:33:26 UTC

[arrow-rs] branch master updated: Fix precision loss in Raw JSON decoder (#4049) (#4051)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 6b17775f3 Fix precision loss in Raw JSON decoder (#4049) (#4051)
6b17775f3 is described below

commit 6b17775f37b939221d855514db4ffb3344deb1f4
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Apr 11 14:33:19 2023 +0100

    Fix precision loss in Raw JSON decoder (#4049) (#4051)
---
 arrow-json/src/raw/mod.rs             | 24 ++++++++++++++++++
 arrow-json/src/raw/primitive_array.rs | 47 +++++++++++++++++++++++++++++++----
 2 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs
index c19552476..38b4cce9b 100644
--- a/arrow-json/src/raw/mod.rs
+++ b/arrow-json/src/raw/mod.rs
@@ -1375,4 +1375,28 @@ mod tests {
             Some("+00:00".into()),
         ));
     }
+
+    #[test]
+    fn test_truncation() {
+        let buf = r#"
+        {"i64": 9223372036854775807, "u64": 18446744073709551615 }
+        {"i64": "9223372036854775807", "u64": "18446744073709551615" }
+        {"i64": -9223372036854775808, "u64": 0 }
+        {"i64": "-9223372036854775808", "u64": 0 }
+        "#;
+
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("i64", DataType::Int64, true),
+            Field::new("u64", DataType::UInt64, true),
+        ]));
+
+        let batches = do_read(buf, 1024, true, schema);
+        assert_eq!(batches.len(), 1);
+
+        let i64 = batches[0].column(0).as_primitive::<Int64Type>();
+        assert_eq!(i64.values(), &[i64::MAX, i64::MAX, i64::MIN, i64::MIN]);
+
+        let u64 = batches[0].column(1).as_primitive::<UInt64Type>();
+        assert_eq!(u64.values(), &[u64::MAX, u64::MAX, u64::MIN, u64::MIN]);
+    }
 }
diff --git a/arrow-json/src/raw/primitive_array.rs b/arrow-json/src/raw/primitive_array.rs
index 72ce30203..6985821d6 100644
--- a/arrow-json/src/raw/primitive_array.rs
+++ b/arrow-json/src/raw/primitive_array.rs
@@ -27,6 +27,45 @@ use arrow_schema::{ArrowError, DataType};
 use crate::raw::tape::{Tape, TapeElement};
 use crate::raw::{tape_error, ArrayDecoder};
 
+/// A trait for JSON-specific primitive parsing logic
+///
+/// According to the specification, unquoted fields should be parsed as double-precision
+/// floating point numbers, including scientific representation such as `2e3`
+///
+/// In practice, it is common to serialize numbers outside the range of an `f64` and expect
+/// them to round-trip correctly. As such, when parsing integers we first parse as the integer
+/// and fall back to parsing as a floating point number if this fails
+trait ParseJsonNumber: Sized {
+    fn parse(s: &[u8]) -> Option<Self>;
+}
+
+macro_rules! primitive_parse {
+    ($($t:ty),+) => {
+        $(impl ParseJsonNumber for $t {
+            fn parse(s: &[u8]) -> Option<Self> {
+                match lexical_core::parse::<Self>(s) {
+                    Ok(f) => Some(f),
+                    Err(_) => lexical_core::parse::<f64>(s).ok().and_then(NumCast::from),
+                }
+            }
+        })+
+    };
+}
+
+primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64);
+
+impl ParseJsonNumber for f32 {
+    fn parse(s: &[u8]) -> Option<Self> {
+        lexical_core::parse::<Self>(s).ok()
+    }
+}
+
+impl ParseJsonNumber for f64 {
+    fn parse(s: &[u8]) -> Option<Self> {
+        lexical_core::parse::<Self>(s).ok()
+    }
+}
+
 pub struct PrimitiveArrayDecoder<P: ArrowPrimitiveType> {
     data_type: DataType,
     // Invariant and Send
@@ -45,7 +84,7 @@ impl<P: ArrowPrimitiveType> PrimitiveArrayDecoder<P> {
 impl<P> ArrayDecoder for PrimitiveArrayDecoder<P>
 where
     P: ArrowPrimitiveType + Parser,
-    P::Native: NumCast,
+    P::Native: ParseJsonNumber,
 {
     fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
         let mut builder = PrimitiveBuilder::<P>::with_capacity(pos.len())
@@ -67,10 +106,8 @@ where
                 }
                 TapeElement::Number(idx) => {
                     let s = tape.get_string(idx);
-                    let value = lexical_core::parse::<f64>(s.as_bytes())
-                        .ok()
-                        .and_then(NumCast::from)
-                        .ok_or_else(|| {
+                    let value =
+                        ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| {
                             ArrowError::JsonError(format!(
                                 "failed to parse {s} as {}",
                                 self.data_type