You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/01/13 19:18:27 UTC

[GitHub] [arrow-rs] helgikrs commented on a change in pull request #1166: Bugfix in parquet writing empty lists of structs

helgikrs commented on a change in pull request #1166:
URL: https://github.com/apache/arrow-rs/pull/1166#discussion_r784256363



##########
File path: parquet/src/arrow/levels.rs
##########
@@ -1675,4 +1682,95 @@ mod tests {
         };
         assert_eq!(list_level, &expected_level);
     }
+
+    #[test]
+    fn test_list_of_struct() {
+        // define schema
+        let int_field = Field::new("a", DataType::Int32, true);
+        let item_field =
+            Field::new("item", DataType::Struct(vec![int_field.clone()]), true);
+        let list_field = Field::new("list", DataType::List(Box::new(item_field)), true);
+
+        let int_builder = Int32Builder::new(10);
+        let struct_builder =
+            StructBuilder::new(vec![int_field], vec![Box::new(int_builder)]);
+        let mut list_builder = ListBuilder::new(struct_builder);
+
+        // [{a: 1}], [], null, [null, null], [{a: null}], [{a: 2}]
+        //
+        // [{a: 1}]
+        let values = list_builder.values();
+        values
+            .field_builder::<Int32Builder>(0)
+            .unwrap()
+            .append_value(1)
+            .unwrap();
+        values.append(true).unwrap();
+        list_builder.append(true).unwrap();
+
+        // []
+        list_builder.append(true).unwrap();
+
+        // null
+        list_builder.append(false).unwrap();
+
+        // [null, null]
+        let values = list_builder.values();
+        values
+            .field_builder::<Int32Builder>(0)
+            .unwrap()
+            .append_null()
+            .unwrap();
+        values.append(false).unwrap();
+        values
+            .field_builder::<Int32Builder>(0)
+            .unwrap()
+            .append_null()
+            .unwrap();
+        values.append(false).unwrap();
+        list_builder.append(true).unwrap();
+
+        // [{a: null}]
+        let values = list_builder.values();
+        values
+            .field_builder::<Int32Builder>(0)
+            .unwrap()
+            .append_null()
+            .unwrap();
+        values.append(true).unwrap();
+        list_builder.append(true).unwrap();
+
+        // [{a: 2}]
+        let values = list_builder.values();
+        values
+            .field_builder::<Int32Builder>(0)
+            .unwrap()
+            .append_value(2)
+            .unwrap();
+        values.append(true).unwrap();
+        list_builder.append(true).unwrap();
+
+        let array = Arc::new(list_builder.finish());
+
+        let schema = Arc::new(Schema::new(vec![list_field]));
+
+        let rb = RecordBatch::try_new(schema, vec![array]).unwrap();
+
+        let batch_level = LevelInfo::new(0, rb.num_rows());
+        let list_level =
+            &batch_level.calculate_array_levels(rb.column(0), rb.schema().field(0))[0];
+
+        let expected_level = LevelInfo {

Review comment:
       I'm not super confident in this either--it would be great if someone with knowledge about the details of this code could chime in.
   
   I compared the definition and repetition levels with what the C++ Parquet writer produces. I exported the above record batch and used the C++ Parquet writer to generate a Parquet file. I then ran `parquet-dump` on the resulting file, which produced the following output:
   ```
   value 1: R:0 D:4 V:1
   value 2: R:0 D:1 V:<null>
   value 3: R:0 D:0 V:<null>
   value 4: R:0 D:2 V:<null>
   value 5: R:1 D:2 V:<null>
   value 6: R:0 D:3 V:<null>
   value 7: R:0 D:4 V:2
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org