You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/04/11 13:33:26 UTC
[arrow-rs] branch master updated: Fix precision loss in Raw JSON decoder (#4049) (#4051)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6b17775f3 Fix precision loss in Raw JSON decoder (#4049) (#4051)
6b17775f3 is described below
commit 6b17775f37b939221d855514db4ffb3344deb1f4
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Tue Apr 11 14:33:19 2023 +0100
Fix precision loss in Raw JSON decoder (#4049) (#4051)
---
arrow-json/src/raw/mod.rs | 24 ++++++++++++++++++
arrow-json/src/raw/primitive_array.rs | 47 +++++++++++++++++++++++++++++++----
2 files changed, 66 insertions(+), 5 deletions(-)
diff --git a/arrow-json/src/raw/mod.rs b/arrow-json/src/raw/mod.rs
index c19552476..38b4cce9b 100644
--- a/arrow-json/src/raw/mod.rs
+++ b/arrow-json/src/raw/mod.rs
@@ -1375,4 +1375,28 @@ mod tests {
Some("+00:00".into()),
));
}
+
+ #[test]
+ fn test_truncation() {
+ let buf = r#"
+ {"i64": 9223372036854775807, "u64": 18446744073709551615 }
+ {"i64": "9223372036854775807", "u64": "18446744073709551615" }
+ {"i64": -9223372036854775808, "u64": 0 }
+ {"i64": "-9223372036854775808", "u64": 0 }
+ "#;
+
+ let schema = Arc::new(Schema::new(vec![
+ Field::new("i64", DataType::Int64, true),
+ Field::new("u64", DataType::UInt64, true),
+ ]));
+
+ let batches = do_read(buf, 1024, true, schema);
+ assert_eq!(batches.len(), 1);
+
+ let i64 = batches[0].column(0).as_primitive::<Int64Type>();
+ assert_eq!(i64.values(), &[i64::MAX, i64::MAX, i64::MIN, i64::MIN]);
+
+ let u64 = batches[0].column(1).as_primitive::<UInt64Type>();
+ assert_eq!(u64.values(), &[u64::MAX, u64::MAX, u64::MIN, u64::MIN]);
+ }
}
diff --git a/arrow-json/src/raw/primitive_array.rs b/arrow-json/src/raw/primitive_array.rs
index 72ce30203..6985821d6 100644
--- a/arrow-json/src/raw/primitive_array.rs
+++ b/arrow-json/src/raw/primitive_array.rs
@@ -27,6 +27,45 @@ use arrow_schema::{ArrowError, DataType};
use crate::raw::tape::{Tape, TapeElement};
use crate::raw::{tape_error, ArrayDecoder};
+/// A trait for JSON-specific primitive parsing logic
+///
+ /// According to the specification, unquoted fields should be parsed as double-precision
+ /// floating point numbers, including scientific representation such as `2e3`.
+///
+ /// In practice, it is common to serialize numbers outside the range of an `f64` and expect
+ /// them to round-trip correctly. As such, when parsing integers we first parse as the integer
+ /// type and fall back to parsing as a floating point number if this fails.
+trait ParseJsonNumber: Sized {
+ fn parse(s: &[u8]) -> Option<Self>;
+}
+
+macro_rules! primitive_parse {
+ ($($t:ty),+) => {
+ $(impl ParseJsonNumber for $t {
+ fn parse(s: &[u8]) -> Option<Self> {
+ match lexical_core::parse::<Self>(s) {
+ Ok(f) => Some(f),
+ Err(_) => lexical_core::parse::<f64>(s).ok().and_then(NumCast::from),
+ }
+ }
+ })+
+ };
+}
+
+primitive_parse!(i8, i16, i32, i64, u8, u16, u32, u64);
+
+impl ParseJsonNumber for f32 {
+ fn parse(s: &[u8]) -> Option<Self> {
+ lexical_core::parse::<Self>(s).ok()
+ }
+}
+
+impl ParseJsonNumber for f64 {
+ fn parse(s: &[u8]) -> Option<Self> {
+ lexical_core::parse::<Self>(s).ok()
+ }
+}
+
pub struct PrimitiveArrayDecoder<P: ArrowPrimitiveType> {
data_type: DataType,
// Invariant and Send
@@ -45,7 +84,7 @@ impl<P: ArrowPrimitiveType> PrimitiveArrayDecoder<P> {
impl<P> ArrayDecoder for PrimitiveArrayDecoder<P>
where
P: ArrowPrimitiveType + Parser,
- P::Native: NumCast,
+ P::Native: ParseJsonNumber,
{
fn decode(&mut self, tape: &Tape<'_>, pos: &[u32]) -> Result<ArrayData, ArrowError> {
let mut builder = PrimitiveBuilder::<P>::with_capacity(pos.len())
@@ -67,10 +106,8 @@ where
}
TapeElement::Number(idx) => {
let s = tape.get_string(idx);
- let value = lexical_core::parse::<f64>(s.as_bytes())
- .ok()
- .and_then(NumCast::from)
- .ok_or_else(|| {
+ let value =
+ ParseJsonNumber::parse(s.as_bytes()).ok_or_else(|| {
ArrowError::JsonError(format!(
"failed to parse {s} as {}",
self.data_type