You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by "alamb (via GitHub)" <gi...@apache.org> on 2023/03/24 11:08:32 UTC

[GitHub] [arrow-rs] alamb commented on a diff in pull request #3906: Enforce struct nullability in JSON raw reader (#3900) (#3904)

alamb commented on code in PR #3906:
URL: https://github.com/apache/arrow-rs/pull/3906#discussion_r1147419834


##########
arrow-array/src/array/boolean_array.rs:
##########
@@ -105,15 +105,10 @@ impl BooleanArray {
     pub fn true_count(&self) -> usize {
         match self.data.nulls() {
             Some(nulls) => {
-                let null_chunks = nulls.inner().bit_chunks();
-                let value_chunks = self.values().bit_chunks();
+                let null_chunks = nulls.inner().bit_chunks().iter_padded();
+                let value_chunks = self.values().bit_chunks().iter_padded();
                 null_chunks
-                    .iter()
-                    .zip(value_chunks.iter())
-                    .chain(std::iter::once((
-                        null_chunks.remainder_bits(),
-                        value_chunks.remainder_bits(),
-                    )))
+                    .zip(value_chunks)

Review Comment:
   drive by consolidation it look like 👍 



##########
arrow-json/src/raw/mod.rs:
##########
@@ -1004,4 +1004,84 @@ mod tests {
         test_time::<Time64MicrosecondType>();
         test_time::<Time64NanosecondType>();
     }
+
+    #[test]
+    fn test_delta_checkpoint() {
+        let json = "{\"protocol\":{\"minReaderVersion\":1,\"minWriterVersion\":2}}";
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "protocol",
+                DataType::Struct(vec![
+                    Field::new("minReaderVersion", DataType::Int32, true),
+                    Field::new("minWriterVersion", DataType::Int32, true),
+                ]),
+                true,
+            ),
+            Field::new(
+                "add",
+                DataType::Struct(vec![Field::new(
+                    "partitionValues",
+                    DataType::Map(
+                        Box::new(Field::new(
+                            "key_value",
+                            DataType::Struct(vec![
+                                Field::new("key", DataType::Utf8, false),
+                                Field::new("value", DataType::Utf8, true),
+                            ]),
+                            false,
+                        )),
+                        false,
+                    ),
+                    false,
+                )]),
+                true,
+            ),
+        ]));
+
+        let batches = do_read(json, 1024, true, schema);
+        assert_eq!(batches.len(), 1);

Review Comment:
   I wonder if asserting the contents with `assert_batches_eq` or similar would be helpful here 
   
   Also would it help to verify that the key is not null is enforced. Like that this document errors` { add: { partition_values: [ foo: bar, null: baz ] } } `?



##########
arrow-json/src/raw/mod.rs:
##########
@@ -514,8 +514,8 @@ mod tests {
             Field::new(
                 "nested",
                 DataType::Struct(vec![
-                    Field::new("a", DataType::Int32, false),
-                    Field::new("b", DataType::Int32, false),
+                    Field::new("a", DataType::Int32, true),

Review Comment:
   This is because
   
   ```
              {"list": null, "nested": {"a": null}}
   ```
   
   has a `null` for `nested:a`, right? So clearly the field needs to be nullable



##########
arrow-json/src/raw/mod.rs:
##########
@@ -594,7 +594,7 @@ mod tests {
                     "list2",
                     DataType::List(Box::new(Field::new(
                         "element",
-                        DataType::Struct(vec![Field::new("d", DataType::Int32, false)]),
+                        DataType::Struct(vec![Field::new("d", DataType::Int32, true)]),

Review Comment:
   Is the issue that in
   ```
              {"list": [], "nested": {"a": 1, "b": 2}, "nested_list": {"list2": [{"c": 3, "d": 5}, {"c": 4}]}}
   ```
   
   The second list element, `{"c": 4}`,  has an implict null in `d`?



##########
arrow-json/src/raw/mod.rs:
##########
@@ -1004,4 +1004,84 @@ mod tests {
         test_time::<Time64MicrosecondType>();
         test_time::<Time64NanosecondType>();
     }
+
+    #[test]
+    fn test_delta_checkpoint() {
+        let json = "{\"protocol\":{\"minReaderVersion\":1,\"minWriterVersion\":2}}";
+        let schema = Arc::new(Schema::new(vec![
+            Field::new(
+                "protocol",
+                DataType::Struct(vec![
+                    Field::new("minReaderVersion", DataType::Int32, true),
+                    Field::new("minWriterVersion", DataType::Int32, true),
+                ]),
+                true,
+            ),
+            Field::new(
+                "add",
+                DataType::Struct(vec![Field::new(
+                    "partitionValues",
+                    DataType::Map(
+                        Box::new(Field::new(
+                            "key_value",
+                            DataType::Struct(vec![
+                                Field::new("key", DataType::Utf8, false),
+                                Field::new("value", DataType::Utf8, true),
+                            ]),
+                            false,
+                        )),
+                        false,
+                    ),
+                    false,
+                )]),
+                true,
+            ),
+        ]));
+
+        let batches = do_read(json, 1024, true, schema);
+        assert_eq!(batches.len(), 1);
+    }
+
+    #[test]
+    fn struct_nullability() {
+        let do_test = |child: DataType| {
+            // Test correctly enforced nullability
+            let non_null = r#"{"foo": {}}"#;
+            let schema = Arc::new(Schema::new(vec![Field::new(
+                "foo",
+                DataType::Struct(vec![Field::new("bar", child, false)]),
+                true,
+            )]));
+            let mut reader = RawReaderBuilder::new(schema.clone())
+                .build(Cursor::new(non_null.as_bytes()))
+                .unwrap();
+            assert!(reader.next().unwrap().is_err()); // Should error as not nullable
+
+            // Test nulls in nullable parent can mask nulls in non-nullable child

Review Comment:
   Should we also verify that this case errors too?
   
   ```
               let null = r#"{"foo": {bar: null}}"#;



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org