You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/04/13 20:39:45 UTC

[arrow-rs] branch master updated: Improve JSON decoder errors (#4076) (#4079)

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 0121cdfad Improve JSON decoder errors (#4076) (#4079)
0121cdfad is described below

commit 0121cdfad5207bf4e8e1c4a7c20775297e4c5ae8
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Apr 13 21:39:38 2023 +0100

    Improve JSON decoder errors (#4076) (#4079)
    
    * Improve JSON decoder errors (#4076)
    
    * Clippy
    
    * Review feedback
---
 arrow-json/src/reader/boolean_array.rs   |   4 +-
 arrow-json/src/reader/decimal_array.rs   |   4 +-
 arrow-json/src/reader/list_array.rs      |   8 +--
 arrow-json/src/reader/map_array.rs       |   8 +--
 arrow-json/src/reader/mod.rs             | 115 +++++++++++++++++++++++++++----
 arrow-json/src/reader/primitive_array.rs |   4 +-
 arrow-json/src/reader/string_array.rs    |   4 +-
 arrow-json/src/reader/struct_array.rs    |  21 ++++--
 arrow-json/src/reader/tape.rs            |  72 +++++++++++++------
 arrow-json/src/reader/timestamp_array.rs |   4 +-
 10 files changed, 183 insertions(+), 61 deletions(-)

diff --git a/arrow-json/src/reader/boolean_array.rs b/arrow-json/src/reader/boolean_array.rs
index 9a7f22680..9094391cd 100644
--- a/arrow-json/src/reader/boolean_array.rs
+++ b/arrow-json/src/reader/boolean_array.rs
@@ -21,7 +21,7 @@ use arrow_data::ArrayData;
 use arrow_schema::ArrowError;
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
 
 #[derive(Default)]
 pub struct BooleanArrayDecoder {}
@@ -34,7 +34,7 @@ impl ArrayDecoder for BooleanArrayDecoder {
                 TapeElement::Null => builder.append_null(),
                 TapeElement::True => builder.append_value(true),
                 TapeElement::False => builder.append_value(false),
-                d => return Err(tape_error(d, "boolean")),
+                _ => return Err(tape.error(*p, "boolean")),
             }
         }
 
diff --git a/arrow-json/src/reader/decimal_array.rs b/arrow-json/src/reader/decimal_array.rs
index 508409ec7..fc3c9aaa6 100644
--- a/arrow-json/src/reader/decimal_array.rs
+++ b/arrow-json/src/reader/decimal_array.rs
@@ -25,7 +25,7 @@ use arrow_data::ArrayData;
 use arrow_schema::ArrowError;
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
 
 pub struct DecimalArrayDecoder<D: DecimalType> {
     precision: u8,
@@ -64,7 +64,7 @@ where
                     let value = parse_decimal::<D>(s, self.precision, self.scale)?;
                     builder.append_value(value)
                 }
-                d => return Err(tape_error(d, "decimal")),
+                _ => return Err(tape.error(*p, "decimal")),
             }
         }
 
diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs
index ac35f9988..aa3538bd5 100644
--- a/arrow-json/src/reader/list_array.rs
+++ b/arrow-json/src/reader/list_array.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, tape_error, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder};
 use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
 use arrow_array::OffsetSizeTrait;
 use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
@@ -78,7 +78,7 @@ impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
                     nulls.append(false);
                     *p + 1
                 }
-                (d, _) => return Err(tape_error(d, "[")),
+                _ => return Err(tape.error(*p, "[")),
             };
 
             let mut cur_idx = *p + 1;
@@ -86,9 +86,7 @@ impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
                 child_pos.push(cur_idx);
 
                 // Advance to next field
-                cur_idx = tape
-                    .next(cur_idx)
-                    .map_err(|d| tape_error(d, "list value"))?;
+                cur_idx = tape.next(cur_idx, "list value")?;
             }
 
             let offset = O::from_usize(child_pos.len()).ok_or_else(|| {
diff --git a/arrow-json/src/reader/map_array.rs b/arrow-json/src/reader/map_array.rs
index 3662e594b..5e800a0d6 100644
--- a/arrow-json/src/reader/map_array.rs
+++ b/arrow-json/src/reader/map_array.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, tape_error, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder};
 use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
 use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
 use arrow_buffer::ArrowNativeType;
@@ -104,14 +104,14 @@ impl ArrayDecoder for MapArrayDecoder {
                     nulls.append(false);
                     p + 1
                 }
-                (d, _) => return Err(tape_error(d, "{")),
+                _ => return Err(tape.error(p, "{")),
             };
 
             let mut cur_idx = p + 1;
             while cur_idx < end_idx {
                 let key = cur_idx;
-                let value = tape.next(key).map_err(|d| tape_error(d, "map key"))?;
-                cur_idx = tape.next(value).map_err(|d| tape_error(d, "map value"))?;
+                let value = tape.next(key, "map key")?;
+                cur_idx = tape.next(value, "map value")?;
 
                 key_pos.push(key);
                 value_pos.push(value);
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index d36493a47..51bba322b 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -632,10 +632,6 @@ fn make_decoder(
     }
 }
 
-fn tape_error(d: TapeElement, expected: &str) -> ArrowError {
-    ArrowError::JsonError(format!("expected {expected} got {d}"))
-}
-
 #[cfg(test)]
 mod tests {
     use std::fs::File;
@@ -962,29 +958,29 @@ mod tests {
         let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)]));
 
         let buf = r#"{"a": 1}"#;
-        let result = ReaderBuilder::new(schema.clone())
+        let err = ReaderBuilder::new(schema.clone())
             .with_batch_size(1024)
             .build(Cursor::new(buf.as_bytes()))
             .unwrap()
-            .read();
+            .read()
+            .unwrap_err();
 
-        assert!(result.is_err());
         assert_eq!(
-            result.unwrap_err().to_string(),
-            "Json error: expected string got number".to_string()
+            err.to_string(),
+            "Json error: whilst decoding field 'a': expected string got 1"
         );
 
         let buf = r#"{"a": true}"#;
-        let result = ReaderBuilder::new(schema)
+        let err = ReaderBuilder::new(schema)
             .with_batch_size(1024)
             .build(Cursor::new(buf.as_bytes()))
             .unwrap()
-            .read();
+            .read()
+            .unwrap_err();
 
-        assert!(result.is_err());
         assert_eq!(
-            result.unwrap_err().to_string(),
-            "Json error: expected string got true".to_string()
+            err.to_string(),
+            "Json error: whilst decoding field 'a': expected string got true"
         );
     }
 
@@ -1866,4 +1862,95 @@ mod tests {
         assert_eq!(3, num_batches);
         assert_eq!(100000000000011, sum_a);
     }
+
+    #[test]
+    fn test_decoder_error() {
+        let schema = Arc::new(Schema::new(vec![Field::new_struct(
+            "a",
+            vec![Field::new("child", DataType::Int32, false)],
+            true,
+        )]));
+
+        let parse_err = |s: &str| {
+            ReaderBuilder::new(schema.clone())
+                .build(Cursor::new(s.as_bytes()))
+                .unwrap()
+                .next()
+                .unwrap()
+                .unwrap_err()
+                .to_string()
+        };
+
+        let err = parse_err(r#"{"a": 123}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': expected { got 123"
+        );
+
+        let err = parse_err(r#"{"a": ["bar"]}"#);
+        assert_eq!(
+            err,
+            r#"Json error: whilst decoding field 'a': expected { got ["bar"]"#
+        );
+
+        let err = parse_err(r#"{"a": []}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': expected { got []"
+        );
+
+        let err = parse_err(r#"{"a": [{"child": 234}]}"#);
+        assert_eq!(
+            err,
+            r#"Json error: whilst decoding field 'a': expected { got [{"child": 234}]"#
+        );
+
+        let err = parse_err(r#"{"a": [{"child": {"foo": [{"foo": ["bar"]}]}}]}"#);
+        assert_eq!(
+            err,
+            r#"Json error: whilst decoding field 'a': expected { got [{"child": {"foo": [{"foo": ["bar"]}]}}]"#
+        );
+
+        let err = parse_err(r#"{"a": true}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': expected { got true"
+        );
+
+        let err = parse_err(r#"{"a": false}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': expected { got false"
+        );
+
+        let err = parse_err(r#"{"a": "foo"}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': expected { got \"foo\""
+        );
+
+        let err = parse_err(r#"{"a": {"child": false}}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got false"
+        );
+
+        let err = parse_err(r#"{"a": {"child": []}}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got []"
+        );
+
+        let err = parse_err(r#"{"a": {"child": [123]}}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got [123]"
+        );
+
+        let err = parse_err(r#"{"a": {"child": [123, 3465346]}}"#);
+        assert_eq!(
+            err,
+            "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got [123, 3465346]"
+        );
+    }
 }
diff --git a/arrow-json/src/reader/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs
index 2d45d9c45..cde52391f 100644
--- a/arrow-json/src/reader/primitive_array.rs
+++ b/arrow-json/src/reader/primitive_array.rs
@@ -25,7 +25,7 @@ use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType};
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
 
 /// A trait for JSON-specific primitive parsing logic
 ///
@@ -116,7 +116,7 @@ where
 
                     builder.append_value(value)
                 }
-                d => return Err(tape_error(d, "primitive")),
+                _ => return Err(tape.error(*p, "primitive")),
             }
         }
 
diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs
index 8060804c9..ea9a71574 100644
--- a/arrow-json/src/reader/string_array.rs
+++ b/arrow-json/src/reader/string_array.rs
@@ -22,7 +22,7 @@ use arrow_schema::ArrowError;
 use std::marker::PhantomData;
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
 
 const TRUE: &str = "true";
 const FALSE: &str = "false";
@@ -61,7 +61,7 @@ impl<O: OffsetSizeTrait> ArrayDecoder for StringArrayDecoder<O> {
                 TapeElement::Number(idx) if coerce_primitive => {
                     data_capacity += tape.get_string(idx).len();
                 }
-                d => return Err(tape_error(d, "string")),
+                _ => return Err(tape.error(*p, "string")),
             }
         }
 
diff --git a/arrow-json/src/reader/struct_array.rs b/arrow-json/src/reader/struct_array.rs
index 013f862c5..6c6f1457b 100644
--- a/arrow-json/src/reader/struct_array.rs
+++ b/arrow-json/src/reader/struct_array.rs
@@ -16,7 +16,7 @@
 // under the License.
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, tape_error, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder};
 use arrow_array::builder::BooleanBufferBuilder;
 use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
 use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -74,7 +74,7 @@ impl ArrayDecoder for StructArrayDecoder {
                     nulls.append(false);
                     continue;
                 }
-                (d, _) => return Err(tape_error(d, "{")),
+                _ => return Err(tape.error(*p, "{")),
             };
 
             let mut cur_idx = *p + 1;
@@ -82,7 +82,7 @@ impl ArrayDecoder for StructArrayDecoder {
                 // Read field name
                 let field_name = match tape.get(cur_idx) {
                     TapeElement::String(s) => tape.get_string(s),
-                    d => return Err(tape_error(d, "field name")),
+                    _ => return Err(tape.error(cur_idx, "field name")),
                 };
 
                 // Update child pos if match found
@@ -93,9 +93,7 @@ impl ArrayDecoder for StructArrayDecoder {
                 }
 
                 // Advance to next field
-                cur_idx = tape
-                    .next(cur_idx + 1)
-                    .map_err(|d| tape_error(d, "field value"))?;
+                cur_idx = tape.next(cur_idx + 1, "field value")?;
             }
         }
 
@@ -103,7 +101,16 @@ impl ArrayDecoder for StructArrayDecoder {
             .decoders
             .iter_mut()
             .zip(child_pos)
-            .map(|(d, pos)| d.decode(tape, &pos))
+            .zip(fields)
+            .map(|((d, pos), f)| {
+                d.decode(tape, &pos).map_err(|e| match e {
+                    ArrowError::JsonError(s) => ArrowError::JsonError(format!(
+                        "whilst decoding field '{}': {s}",
+                        f.name()
+                    )),
+                    e => e,
+                })
+            })
             .collect::<Result<Vec<_>, ArrowError>>()?;
 
         let nulls = nulls
diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs
index 885257ed1..5eca7b43d 100644
--- a/arrow-json/src/reader/tape.rs
+++ b/arrow-json/src/reader/tape.rs
@@ -18,7 +18,6 @@
 use crate::reader::serializer::TapeSerializer;
 use arrow_schema::ArrowError;
 use serde::Serialize;
-use std::fmt::{Display, Formatter};
 
 /// We decode JSON to a flattened tape representation,
 /// allowing for efficient traversal of the JSON data
@@ -63,22 +62,6 @@ pub enum TapeElement {
     Null,
 }
 
-impl Display for TapeElement {
-    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
-        match self {
-            TapeElement::StartObject(_) => write!(f, "{{"),
-            TapeElement::EndObject(_) => write!(f, "}}"),
-            TapeElement::StartList(_) => write!(f, "["),
-            TapeElement::EndList(_) => write!(f, "]"),
-            TapeElement::String(_) => write!(f, "string"),
-            TapeElement::Number(_) => write!(f, "number"),
-            TapeElement::True => write!(f, "true"),
-            TapeElement::False => write!(f, "false"),
-            TapeElement::Null => write!(f, "null"),
-        }
-    }
-}
-
 /// A decoded JSON tape
 ///
 /// String and numeric data is stored alongside an array of [`TapeElement`]
@@ -114,9 +97,8 @@ impl<'a> Tape<'a> {
 
     /// Returns the index of the next field at the same level as `cur_idx`
     ///
-    /// Return an error containing the [`TapeElement`] at `cur_idx` if it
-    /// is not the start of a field
-    pub fn next(&self, cur_idx: u32) -> Result<u32, TapeElement> {
+    /// Return an error if `cur_idx` is not the start of a field
+    pub fn next(&self, cur_idx: u32, expected: &str) -> Result<u32, ArrowError> {
         match self.get(cur_idx) {
             TapeElement::String(_)
             | TapeElement::Number(_)
@@ -125,7 +107,7 @@ impl<'a> Tape<'a> {
             | TapeElement::Null => Ok(cur_idx + 1),
             TapeElement::StartList(end_idx) => Ok(end_idx + 1),
             TapeElement::StartObject(end_idx) => Ok(end_idx + 1),
-            d => Err(d),
+            _ => Err(self.error(cur_idx, expected)),
         }
     }
 
@@ -133,6 +115,54 @@ impl<'a> Tape<'a> {
     pub fn num_rows(&self) -> usize {
         self.num_rows
     }
+
+    /// Serialize the tape element at index `idx` to `out` returning the next field index
+    fn serialize(&self, out: &mut String, idx: u32) -> u32 {
+        match self.get(idx) {
+            TapeElement::StartObject(end) => {
+                out.push('{');
+                let mut cur_idx = idx + 1;
+                while cur_idx < end {
+                    cur_idx = self.serialize(out, cur_idx);
+                    out.push_str(": ");
+                    cur_idx = self.serialize(out, cur_idx);
+                }
+                out.push('}');
+                return end + 1;
+            }
+            TapeElement::EndObject(_) => out.push('}'),
+            TapeElement::StartList(end) => {
+                out.push('[');
+                let mut cur_idx = idx + 1;
+                while cur_idx < end {
+                    cur_idx = self.serialize(out, cur_idx);
+                    if cur_idx < end {
+                        out.push_str(", ");
+                    }
+                }
+                out.push(']');
+                return end + 1;
+            }
+            TapeElement::EndList(_) => out.push(']'),
+            TapeElement::String(s) => {
+                out.push('"');
+                out.push_str(self.get_string(s));
+                out.push('"')
+            }
+            TapeElement::Number(n) => out.push_str(self.get_string(n)),
+            TapeElement::True => out.push_str("true"),
+            TapeElement::False => out.push_str("false"),
+            TapeElement::Null => out.push_str("null"),
+        }
+        idx + 1
+    }
+
+    /// Returns an error reading index `idx`
+    pub fn error(&self, idx: u32, expected: &str) -> ArrowError {
+        let mut out = String::with_capacity(64);
+        self.serialize(&mut out, idx);
+        ArrowError::JsonError(format!("expected {expected} got {out}"))
+    }
 }
 
 /// States based on <https://www.json.org/json-en.html>
diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs
index 73d1cda91..249613d33 100644
--- a/arrow-json/src/reader/timestamp_array.rs
+++ b/arrow-json/src/reader/timestamp_array.rs
@@ -27,7 +27,7 @@ use arrow_data::ArrayData;
 use arrow_schema::{ArrowError, DataType, TimeUnit};
 
 use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
 
 /// A specialized [`ArrayDecoder`] for timestamps
 pub struct TimestampArrayDecoder<P: ArrowTimestampType, Tz: TimeZone> {
@@ -90,7 +90,7 @@ where
 
                     builder.append_value(value)
                 }
-                d => return Err(tape_error(d, "primitive")),
+                _ => return Err(tape.error(*p, "primitive")),
             }
         }