You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by tu...@apache.org on 2023/04/13 20:39:45 UTC
[arrow-rs] branch master updated: Improve JSON decoder errors (#4076) (#4079)
This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0121cdfad Improve JSON decoder errors (#4076) (#4079)
0121cdfad is described below
commit 0121cdfad5207bf4e8e1c4a7c20775297e4c5ae8
Author: Raphael Taylor-Davies <17...@users.noreply.github.com>
AuthorDate: Thu Apr 13 21:39:38 2023 +0100
Improve JSON decoder errors (#4076) (#4079)
* Improve JSON decoder errors (#4076)
* Clippy
* Review feedback
---
arrow-json/src/reader/boolean_array.rs | 4 +-
arrow-json/src/reader/decimal_array.rs | 4 +-
arrow-json/src/reader/list_array.rs | 8 +--
arrow-json/src/reader/map_array.rs | 8 +--
arrow-json/src/reader/mod.rs | 115 +++++++++++++++++++++++++++----
arrow-json/src/reader/primitive_array.rs | 4 +-
arrow-json/src/reader/string_array.rs | 4 +-
arrow-json/src/reader/struct_array.rs | 21 ++++--
arrow-json/src/reader/tape.rs | 72 +++++++++++++------
arrow-json/src/reader/timestamp_array.rs | 4 +-
10 files changed, 183 insertions(+), 61 deletions(-)
diff --git a/arrow-json/src/reader/boolean_array.rs b/arrow-json/src/reader/boolean_array.rs
index 9a7f22680..9094391cd 100644
--- a/arrow-json/src/reader/boolean_array.rs
+++ b/arrow-json/src/reader/boolean_array.rs
@@ -21,7 +21,7 @@ use arrow_data::ArrayData;
use arrow_schema::ArrowError;
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
#[derive(Default)]
pub struct BooleanArrayDecoder {}
@@ -34,7 +34,7 @@ impl ArrayDecoder for BooleanArrayDecoder {
TapeElement::Null => builder.append_null(),
TapeElement::True => builder.append_value(true),
TapeElement::False => builder.append_value(false),
- d => return Err(tape_error(d, "boolean")),
+ _ => return Err(tape.error(*p, "boolean")),
}
}
diff --git a/arrow-json/src/reader/decimal_array.rs b/arrow-json/src/reader/decimal_array.rs
index 508409ec7..fc3c9aaa6 100644
--- a/arrow-json/src/reader/decimal_array.rs
+++ b/arrow-json/src/reader/decimal_array.rs
@@ -25,7 +25,7 @@ use arrow_data::ArrayData;
use arrow_schema::ArrowError;
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
pub struct DecimalArrayDecoder<D: DecimalType> {
precision: u8,
@@ -64,7 +64,7 @@ where
let value = parse_decimal::<D>(s, self.precision, self.scale)?;
builder.append_value(value)
}
- d => return Err(tape_error(d, "decimal")),
+ _ => return Err(tape.error(*p, "decimal")),
}
}
diff --git a/arrow-json/src/reader/list_array.rs b/arrow-json/src/reader/list_array.rs
index ac35f9988..aa3538bd5 100644
--- a/arrow-json/src/reader/list_array.rs
+++ b/arrow-json/src/reader/list_array.rs
@@ -16,7 +16,7 @@
// under the License.
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, tape_error, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder};
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_array::OffsetSizeTrait;
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
@@ -78,7 +78,7 @@ impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
nulls.append(false);
*p + 1
}
- (d, _) => return Err(tape_error(d, "[")),
+ _ => return Err(tape.error(*p, "[")),
};
let mut cur_idx = *p + 1;
@@ -86,9 +86,7 @@ impl<O: OffsetSizeTrait> ArrayDecoder for ListArrayDecoder<O> {
child_pos.push(cur_idx);
// Advance to next field
- cur_idx = tape
- .next(cur_idx)
- .map_err(|d| tape_error(d, "list value"))?;
+ cur_idx = tape.next(cur_idx, "list value")?;
}
let offset = O::from_usize(child_pos.len()).ok_or_else(|| {
diff --git a/arrow-json/src/reader/map_array.rs b/arrow-json/src/reader/map_array.rs
index 3662e594b..5e800a0d6 100644
--- a/arrow-json/src/reader/map_array.rs
+++ b/arrow-json/src/reader/map_array.rs
@@ -16,7 +16,7 @@
// under the License.
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, tape_error, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder};
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_buffer::ArrowNativeType;
@@ -104,14 +104,14 @@ impl ArrayDecoder for MapArrayDecoder {
nulls.append(false);
p + 1
}
- (d, _) => return Err(tape_error(d, "{")),
+ _ => return Err(tape.error(p, "{")),
};
let mut cur_idx = p + 1;
while cur_idx < end_idx {
let key = cur_idx;
- let value = tape.next(key).map_err(|d| tape_error(d, "map key"))?;
- cur_idx = tape.next(value).map_err(|d| tape_error(d, "map value"))?;
+ let value = tape.next(key, "map key")?;
+ cur_idx = tape.next(value, "map value")?;
key_pos.push(key);
value_pos.push(value);
diff --git a/arrow-json/src/reader/mod.rs b/arrow-json/src/reader/mod.rs
index d36493a47..51bba322b 100644
--- a/arrow-json/src/reader/mod.rs
+++ b/arrow-json/src/reader/mod.rs
@@ -632,10 +632,6 @@ fn make_decoder(
}
}
-fn tape_error(d: TapeElement, expected: &str) -> ArrowError {
- ArrowError::JsonError(format!("expected {expected} got {d}"))
-}
-
#[cfg(test)]
mod tests {
use std::fs::File;
@@ -962,29 +958,29 @@ mod tests {
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, true)]));
let buf = r#"{"a": 1}"#;
- let result = ReaderBuilder::new(schema.clone())
+ let err = ReaderBuilder::new(schema.clone())
.with_batch_size(1024)
.build(Cursor::new(buf.as_bytes()))
.unwrap()
- .read();
+ .read()
+ .unwrap_err();
- assert!(result.is_err());
assert_eq!(
- result.unwrap_err().to_string(),
- "Json error: expected string got number".to_string()
+ err.to_string(),
+ "Json error: whilst decoding field 'a': expected string got 1"
);
let buf = r#"{"a": true}"#;
- let result = ReaderBuilder::new(schema)
+ let err = ReaderBuilder::new(schema)
.with_batch_size(1024)
.build(Cursor::new(buf.as_bytes()))
.unwrap()
- .read();
+ .read()
+ .unwrap_err();
- assert!(result.is_err());
assert_eq!(
- result.unwrap_err().to_string(),
- "Json error: expected string got true".to_string()
+ err.to_string(),
+ "Json error: whilst decoding field 'a': expected string got true"
);
}
@@ -1866,4 +1862,95 @@ mod tests {
assert_eq!(3, num_batches);
assert_eq!(100000000000011, sum_a);
}
+
+ #[test]
+ fn test_decoder_error() {
+ let schema = Arc::new(Schema::new(vec![Field::new_struct(
+ "a",
+ vec![Field::new("child", DataType::Int32, false)],
+ true,
+ )]));
+
+ let parse_err = |s: &str| {
+ ReaderBuilder::new(schema.clone())
+ .build(Cursor::new(s.as_bytes()))
+ .unwrap()
+ .next()
+ .unwrap()
+ .unwrap_err()
+ .to_string()
+ };
+
+ let err = parse_err(r#"{"a": 123}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': expected { got 123"
+ );
+
+ let err = parse_err(r#"{"a": ["bar"]}"#);
+ assert_eq!(
+ err,
+ r#"Json error: whilst decoding field 'a': expected { got ["bar"]"#
+ );
+
+ let err = parse_err(r#"{"a": []}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': expected { got []"
+ );
+
+ let err = parse_err(r#"{"a": [{"child": 234}]}"#);
+ assert_eq!(
+ err,
+ r#"Json error: whilst decoding field 'a': expected { got [{"child": 234}]"#
+ );
+
+ let err = parse_err(r#"{"a": [{"child": {"foo": [{"foo": ["bar"]}]}}]}"#);
+ assert_eq!(
+ err,
+ r#"Json error: whilst decoding field 'a': expected { got [{"child": {"foo": [{"foo": ["bar"]}]}}]"#
+ );
+
+ let err = parse_err(r#"{"a": true}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': expected { got true"
+ );
+
+ let err = parse_err(r#"{"a": false}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': expected { got false"
+ );
+
+ let err = parse_err(r#"{"a": "foo"}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': expected { got \"foo\""
+ );
+
+ let err = parse_err(r#"{"a": {"child": false}}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got false"
+ );
+
+ let err = parse_err(r#"{"a": {"child": []}}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got []"
+ );
+
+ let err = parse_err(r#"{"a": {"child": [123]}}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got [123]"
+ );
+
+ let err = parse_err(r#"{"a": {"child": [123, 3465346]}}"#);
+ assert_eq!(
+ err,
+ "Json error: whilst decoding field 'a': whilst decoding field 'child': expected primitive got [123, 3465346]"
+ );
+ }
}
diff --git a/arrow-json/src/reader/primitive_array.rs b/arrow-json/src/reader/primitive_array.rs
index 2d45d9c45..cde52391f 100644
--- a/arrow-json/src/reader/primitive_array.rs
+++ b/arrow-json/src/reader/primitive_array.rs
@@ -25,7 +25,7 @@ use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
/// A trait for JSON-specific primitive parsing logic
///
@@ -116,7 +116,7 @@ where
builder.append_value(value)
}
- d => return Err(tape_error(d, "primitive")),
+ _ => return Err(tape.error(*p, "primitive")),
}
}
diff --git a/arrow-json/src/reader/string_array.rs b/arrow-json/src/reader/string_array.rs
index 8060804c9..ea9a71574 100644
--- a/arrow-json/src/reader/string_array.rs
+++ b/arrow-json/src/reader/string_array.rs
@@ -22,7 +22,7 @@ use arrow_schema::ArrowError;
use std::marker::PhantomData;
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
const TRUE: &str = "true";
const FALSE: &str = "false";
@@ -61,7 +61,7 @@ impl<O: OffsetSizeTrait> ArrayDecoder for StringArrayDecoder<O> {
TapeElement::Number(idx) if coerce_primitive => {
data_capacity += tape.get_string(idx).len();
}
- d => return Err(tape_error(d, "string")),
+ _ => return Err(tape.error(*p, "string")),
}
}
diff --git a/arrow-json/src/reader/struct_array.rs b/arrow-json/src/reader/struct_array.rs
index 013f862c5..6c6f1457b 100644
--- a/arrow-json/src/reader/struct_array.rs
+++ b/arrow-json/src/reader/struct_array.rs
@@ -16,7 +16,7 @@
// under the License.
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{make_decoder, tape_error, ArrayDecoder};
+use crate::reader::{make_decoder, ArrayDecoder};
use arrow_array::builder::BooleanBufferBuilder;
use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
use arrow_data::{ArrayData, ArrayDataBuilder};
@@ -74,7 +74,7 @@ impl ArrayDecoder for StructArrayDecoder {
nulls.append(false);
continue;
}
- (d, _) => return Err(tape_error(d, "{")),
+ _ => return Err(tape.error(*p, "{")),
};
let mut cur_idx = *p + 1;
@@ -82,7 +82,7 @@ impl ArrayDecoder for StructArrayDecoder {
// Read field name
let field_name = match tape.get(cur_idx) {
TapeElement::String(s) => tape.get_string(s),
- d => return Err(tape_error(d, "field name")),
+ _ => return Err(tape.error(cur_idx, "field name")),
};
// Update child pos if match found
@@ -93,9 +93,7 @@ impl ArrayDecoder for StructArrayDecoder {
}
// Advance to next field
- cur_idx = tape
- .next(cur_idx + 1)
- .map_err(|d| tape_error(d, "field value"))?;
+ cur_idx = tape.next(cur_idx + 1, "field value")?;
}
}
@@ -103,7 +101,16 @@ impl ArrayDecoder for StructArrayDecoder {
.decoders
.iter_mut()
.zip(child_pos)
- .map(|(d, pos)| d.decode(tape, &pos))
+ .zip(fields)
+ .map(|((d, pos), f)| {
+ d.decode(tape, &pos).map_err(|e| match e {
+ ArrowError::JsonError(s) => ArrowError::JsonError(format!(
+ "whilst decoding field '{}': {s}",
+ f.name()
+ )),
+ e => e,
+ })
+ })
.collect::<Result<Vec<_>, ArrowError>>()?;
let nulls = nulls
diff --git a/arrow-json/src/reader/tape.rs b/arrow-json/src/reader/tape.rs
index 885257ed1..5eca7b43d 100644
--- a/arrow-json/src/reader/tape.rs
+++ b/arrow-json/src/reader/tape.rs
@@ -18,7 +18,6 @@
use crate::reader::serializer::TapeSerializer;
use arrow_schema::ArrowError;
use serde::Serialize;
-use std::fmt::{Display, Formatter};
/// We decode JSON to a flattened tape representation,
/// allowing for efficient traversal of the JSON data
@@ -63,22 +62,6 @@ pub enum TapeElement {
Null,
}
-impl Display for TapeElement {
- fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
- match self {
- TapeElement::StartObject(_) => write!(f, "{{"),
- TapeElement::EndObject(_) => write!(f, "}}"),
- TapeElement::StartList(_) => write!(f, "["),
- TapeElement::EndList(_) => write!(f, "]"),
- TapeElement::String(_) => write!(f, "string"),
- TapeElement::Number(_) => write!(f, "number"),
- TapeElement::True => write!(f, "true"),
- TapeElement::False => write!(f, "false"),
- TapeElement::Null => write!(f, "null"),
- }
- }
-}
-
/// A decoded JSON tape
///
/// String and numeric data is stored alongside an array of [`TapeElement`]
@@ -114,9 +97,8 @@ impl<'a> Tape<'a> {
/// Returns the index of the next field at the same level as `cur_idx`
///
- /// Return an error containing the [`TapeElement`] at `cur_idx` if it
- /// is not the start of a field
- pub fn next(&self, cur_idx: u32) -> Result<u32, TapeElement> {
+ /// Return an error if `cur_idx` is not the start of a field
+ pub fn next(&self, cur_idx: u32, expected: &str) -> Result<u32, ArrowError> {
match self.get(cur_idx) {
TapeElement::String(_)
| TapeElement::Number(_)
@@ -125,7 +107,7 @@ impl<'a> Tape<'a> {
| TapeElement::Null => Ok(cur_idx + 1),
TapeElement::StartList(end_idx) => Ok(end_idx + 1),
TapeElement::StartObject(end_idx) => Ok(end_idx + 1),
- d => Err(d),
+ _ => Err(self.error(cur_idx, expected)),
}
}
@@ -133,6 +115,54 @@ impl<'a> Tape<'a> {
pub fn num_rows(&self) -> usize {
self.num_rows
}
+
+ /// Serialize the tape element at index `idx` to `out` returning the next field index
+ fn serialize(&self, out: &mut String, idx: u32) -> u32 {
+ match self.get(idx) {
+ TapeElement::StartObject(end) => {
+ out.push('{');
+ let mut cur_idx = idx + 1;
+ while cur_idx < end {
+ cur_idx = self.serialize(out, cur_idx);
+ out.push_str(": ");
+ cur_idx = self.serialize(out, cur_idx);
+ }
+ out.push('}');
+ return end + 1;
+ }
+ TapeElement::EndObject(_) => out.push('}'),
+ TapeElement::StartList(end) => {
+ out.push('[');
+ let mut cur_idx = idx + 1;
+ while cur_idx < end {
+ cur_idx = self.serialize(out, cur_idx);
+ if cur_idx < end {
+ out.push_str(", ");
+ }
+ }
+ out.push(']');
+ return end + 1;
+ }
+ TapeElement::EndList(_) => out.push(']'),
+ TapeElement::String(s) => {
+ out.push('"');
+ out.push_str(self.get_string(s));
+ out.push('"')
+ }
+ TapeElement::Number(n) => out.push_str(self.get_string(n)),
+ TapeElement::True => out.push_str("true"),
+ TapeElement::False => out.push_str("false"),
+ TapeElement::Null => out.push_str("null"),
+ }
+ idx + 1
+ }
+
+ /// Returns an error reading index `idx`
+ pub fn error(&self, idx: u32, expected: &str) -> ArrowError {
+ let mut out = String::with_capacity(64);
+ self.serialize(&mut out, idx);
+ ArrowError::JsonError(format!("expected {expected} got {out}"))
+ }
}
/// States based on <https://www.json.org/json-en.html>
diff --git a/arrow-json/src/reader/timestamp_array.rs b/arrow-json/src/reader/timestamp_array.rs
index 73d1cda91..249613d33 100644
--- a/arrow-json/src/reader/timestamp_array.rs
+++ b/arrow-json/src/reader/timestamp_array.rs
@@ -27,7 +27,7 @@ use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType, TimeUnit};
use crate::reader::tape::{Tape, TapeElement};
-use crate::reader::{tape_error, ArrayDecoder};
+use crate::reader::ArrayDecoder;
/// A specialized [`ArrayDecoder`] for timestamps
pub struct TimestampArrayDecoder<P: ArrowTimestampType, Tz: TimeZone> {
@@ -90,7 +90,7 @@ where
builder.append_value(value)
}
- d => return Err(tape_error(d, "primitive")),
+ _ => return Err(tape.error(*p, "primitive")),
}
}