You are viewing a plain text version of this content. The canonical link for it is here.
Posted to github@arrow.apache.org by GitBox <gi...@apache.org> on 2022/05/27 11:09:40 UTC

[GitHub] [arrow-rs] nevi-me commented on a diff in pull request #1746: Support writing nested lists to parquet

nevi-me commented on code in PR #1746:
URL: https://github.com/apache/arrow-rs/pull/1746#discussion_r883512381


##########
parquet/src/arrow/levels.rs:
##########
@@ -825,424 +477,270 @@ mod tests {
 
     use arrow::array::*;
     use arrow::buffer::Buffer;
-    use arrow::datatypes::{Schema, ToByteSlice};
+    use arrow::datatypes::{Int32Type, Schema, ToByteSlice};
     use arrow::record_batch::RecordBatch;
+    use arrow::util::pretty::pretty_format_columns;
 
     #[test]
     fn test_calculate_array_levels_twitter_example() {
         // based on the example at https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet.html
         // [[a, b, c], [d, e, f, g]], [[h], [i,j]]
-        let parent_levels = LevelInfo {
-            definition: vec![0, 0],
-            repetition: None,
-            array_offsets: vec![0, 1, 2], // 2 records, root offsets always sequential
-            array_mask: vec![true, true], // both lists defined
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 2,
-        };
-        // offset into array, each level1 has 2 values
-        let array_offsets = vec![0, 2, 4];
-        let array_mask = vec![true, true];
-
-        // calculate level1 levels
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(false),
-        );
-        //
-        let expected_levels = LevelInfo {
-            definition: vec![1, 1, 1, 1],
-            repetition: Some(vec![0, 1, 0, 1]),
-            array_offsets,
-            array_mask: vec![true, true, true, true],
-            max_definition: 1,
-            level_type: LevelType::List(false),
-            offset: 0,
-            length: 4,
-        };
-        // the separate asserts make it easier to see what's failing
-        assert_eq!(&levels.definition, &expected_levels.definition);
-        assert_eq!(&levels.repetition, &expected_levels.repetition);
-        assert_eq!(&levels.array_mask, &expected_levels.array_mask);
-        assert_eq!(&levels.array_offsets, &expected_levels.array_offsets);
-        assert_eq!(&levels.max_definition, &expected_levels.max_definition);
-        assert_eq!(&levels.level_type, &expected_levels.level_type);
-        // this assert is to help if there are more variables added to the struct
-        assert_eq!(&levels, &expected_levels);
-
-        // level2
-        let parent_levels = levels;
-        let array_offsets = vec![0, 3, 7, 8, 10];
-        let array_mask = vec![true, true, true, true];
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(false),
-        );
-        let expected_levels = LevelInfo {
-            definition: vec![2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
-            repetition: Some(vec![0, 2, 2, 1, 2, 2, 2, 0, 1, 2]),
-            array_offsets,
-            array_mask: vec![true; 10],
-            max_definition: 2,
-            level_type: LevelType::List(false),
-            offset: 0,
-            length: 10,
+
+        let leaf_type = Field::new("item", DataType::Int32, false);
+        let inner_type = DataType::List(Box::new(leaf_type));
+        let inner_field = Field::new("l2", inner_type.clone(), false);
+        let outer_type = DataType::List(Box::new(inner_field));
+        let outer_field = Field::new("l1", outer_type.clone(), false);
+
+        let primitives = Int32Array::from_iter(0..10);
+
+        // Cannot use from_iter_primitive as always infers nullable
+        let offsets = Buffer::from_iter([0_i32, 3, 7, 8, 10]);
+        let inner_list = ArrayDataBuilder::new(inner_type)
+            .len(4)
+            .add_buffer(offsets)
+            .add_child_data(primitives.data().clone())
+            .build()
+            .unwrap();
+
+        let offsets = Buffer::from_iter([0_i32, 2, 4]);
+        let outer_list = ArrayDataBuilder::new(outer_type)
+            .len(2)
+            .add_buffer(offsets)
+            .add_child_data(inner_list)
+            .build()
+            .unwrap();
+        let outer_list = make_array(outer_list);
+
+        let levels = calculate_array_levels(&outer_list, &outer_field).unwrap();
+        assert_eq!(levels.len(), 1);
+
+        let expected = LevelInfo {
+            def_levels: Some(vec![2; 10]),
+            rep_levels: Some(vec![0, 2, 2, 1, 2, 2, 2, 0, 1, 2]),
+            non_null_indices: vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            max_def_level: 2,
+            max_rep_level: 2,
         };
-        assert_eq!(&levels.definition, &expected_levels.definition);
-        assert_eq!(&levels.repetition, &expected_levels.repetition);
-        assert_eq!(&levels.array_mask, &expected_levels.array_mask);
-        assert_eq!(&levels.max_definition, &expected_levels.max_definition);
-        assert_eq!(&levels.array_offsets, &expected_levels.array_offsets);
-        assert_eq!(&levels.level_type, &expected_levels.level_type);
-        assert_eq!(&levels, &expected_levels);
+        assert_eq!(&levels[0], &expected);
     }
 
     #[test]
     fn test_calculate_one_level_1() {
         // This test calculates the levels for a non-null primitive array
-        let parent_levels = LevelInfo {
-            definition: vec![0; 10],
-            repetition: None,
-            array_offsets: (0..=10).collect(),
-            array_mask: vec![true; 10],
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 10,
-        };
-        let array_offsets: Vec<i64> = (0..=10).collect();
-        let array_mask = vec![true; 10];
+        let array = Arc::new(Int32Array::from_iter(0..10)) as ArrayRef;
+        let field = Field::new("item", DataType::Int32, false);
+
+        let levels = calculate_array_levels(&array, &field).unwrap();
+        assert_eq!(levels.len(), 1);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask.clone(),
-            LevelType::Primitive(false),
-        );
         let expected_levels = LevelInfo {
-            // As it is non-null, definitions can be omitted
-            definition: vec![0; 10],
-            repetition: None,
-            array_offsets,
-            array_mask,
-            max_definition: 0,
-            level_type: LevelType::Primitive(false),
-            offset: 0,
-            length: 10,
+            def_levels: None,
+            rep_levels: None,
+            non_null_indices: (0..10).collect(),
+            max_def_level: 0,
+            max_rep_level: 0,
         };
-        assert_eq!(&levels, &expected_levels);
+        assert_eq!(&levels[0], &expected_levels);
     }
 
     #[test]
     fn test_calculate_one_level_2() {
-        // This test calculates the levels for a non-null primitive array
-        let parent_levels = LevelInfo {
-            definition: vec![0; 5],
-            repetition: None,
-            array_offsets: (0..=5).collect(),
-            array_mask: vec![true, true, true, true, true],
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 5,
-        };
-        let array_offsets: Vec<i64> = (0..=5).collect();
-        let array_mask = vec![true, false, true, true, false];
+        // This test calculates the levels for a nullable primitive array
+        let array = Arc::new(Int32Array::from_iter([
+            Some(0),
+            None,
+            Some(0),
+            Some(0),
+            None,
+        ])) as ArrayRef;
+        let field = Field::new("item", DataType::Int32, true);
+
+        let levels = calculate_array_levels(&array, &field).unwrap();
+        assert_eq!(levels.len(), 1);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask.clone(),
-            LevelType::Primitive(true),
-        );
         let expected_levels = LevelInfo {
-            definition: vec![1, 0, 1, 1, 0],
-            repetition: None,
-            array_offsets,
-            array_mask,
-            max_definition: 1,
-            level_type: LevelType::Primitive(true),
-            offset: 0,
-            length: 5,
+            def_levels: Some(vec![1, 0, 1, 1, 0]),
+            rep_levels: None,
+            non_null_indices: vec![0, 2, 3],
+            max_def_level: 1,
+            max_rep_level: 0,
         };
-        assert_eq!(&levels, &expected_levels);
+        assert_eq!(&levels[0], &expected_levels);
     }
 
     #[test]
     fn test_calculate_array_levels_1() {
+        let leaf_field = Field::new("item", DataType::Int32, false);
+        let list_type = DataType::List(Box::new(leaf_field));
+
         // if all array values are defined (e.g. batch<list<_>>)
         // [[0], [1], [2], [3], [4]]
-        let parent_levels = LevelInfo {
-            definition: vec![0; 5],
-            repetition: None,
-            array_offsets: vec![0, 1, 2, 3, 4, 5],
-            array_mask: vec![true, true, true, true, true],
-            max_definition: 0,
-            level_type: LevelType::Root,
-            offset: 0,
-            length: 5,
+
+        let leaf_array = Int32Array::from_iter(0..5);
+        // Cannot use from_iter_primitive as always infers nullable
+        let offsets = Buffer::from_iter(0_i32..6);
+        let list = ArrayDataBuilder::new(list_type.clone())
+            .len(5)
+            .add_buffer(offsets)
+            .add_child_data(leaf_array.data().clone())
+            .build()
+            .unwrap();
+        let list = make_array(list);
+
+        let list_field = Field::new("list", list_type.clone(), false);
+        let levels = calculate_array_levels(&list, &list_field).unwrap();
+        assert_eq!(levels.len(), 1);
+
+        let expected_levels = LevelInfo {
+            def_levels: Some(vec![1; 5]),
+            rep_levels: Some(vec![0; 5]),
+            non_null_indices: (0..5).collect(),
+            max_def_level: 1,
+            max_rep_level: 1,
         };
-        let array_offsets = vec![0, 2, 2, 4, 8, 11];
-        let array_mask = vec![true, false, true, true, true];
+        assert_eq!(&levels[0], &expected_levels);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(true),
-        );
-        // array: [[0, 0], _1_, [2, 2], [3, 3, 3, 3], [4, 4, 4]]
+        // array: [[0, 0], NULL, [2, 2], [3, 3, 3, 3], [4, 4, 4]]
         // all values are defined as we do not have nulls on the root (batch)
         // repetition:
         //   0: 0, 1
-        //   1:
+        //   1: 0
         //   2: 0, 1
         //   3: 0, 1, 1, 1
         //   4: 0, 1, 1
-        let expected_levels = LevelInfo {
-            // The levels are normally 2 because we:
-            // - Calculate the level at the list
-            // - Calculate the level at the list's child
-            // We do not do this in these tests, thus the levels are 1 less.
-            definition: vec![2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2],
-            repetition: Some(vec![0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1]),
-            array_offsets,
-            array_mask: vec![
-                true, true, false, true, true, true, true, true, true, true, true, true,
-            ],
-            max_definition: 2,
-            level_type: LevelType::List(true),
-            offset: 0,
-            length: 11, // the child has 11 slots
-        };
-        assert_eq!(&levels.definition, &expected_levels.definition);
-        assert_eq!(&levels.repetition, &expected_levels.repetition);
-        assert_eq!(&levels.array_offsets, &expected_levels.array_offsets);
-        assert_eq!(&levels.max_definition, &expected_levels.max_definition);
-        assert_eq!(&levels.level_type, &expected_levels.level_type);
-        assert_eq!(&levels, &expected_levels);
-    }
+        let leaf_array = Int32Array::from_iter([0, 0, 2, 2, 3, 3, 3, 3, 4, 4, 4]);
+        let offsets = Buffer::from_iter([0_i32, 2, 2, 4, 8, 11]);
+        let list = ArrayDataBuilder::new(list_type.clone())
+            .len(5)
+            .add_buffer(offsets)
+            .add_child_data(leaf_array.data().clone())
+            .null_bit_buffer(Buffer::from([0b00011101]))
+            .build()
+            .unwrap();
+        let list = make_array(list);
 
-    #[test]
-    fn test_calculate_array_levels_2() {
-        // If some values are null
-        //
-        // This emulates an array in the form: <struct<list<?>>
-        // with values:
-        // - 0: [0, 1], but is null because of the struct
-        // - 1: []
-        // - 2: [2, 3], but is null because of the struct
-        // - 3: [4, 5, 6, 7]
-        // - 4: [8, 9, 10]
-        //
-        // If the first values of a list are null due to a parent, we have to still account for them
-        // while indexing, because they would affect the way the child is indexed
-        // i.e. in the above example, we have to know that [0, 1] has to be skipped
-        let parent_levels = LevelInfo {
-            definition: vec![0, 1, 0, 1, 1],
-            repetition: None,
-            array_offsets: vec![0, 1, 2, 3, 4, 5],
-            array_mask: vec![false, true, false, true, true],
-            max_definition: 1,
-            level_type: LevelType::Struct(true),
-            offset: 0,
-            length: 5,
-        };
-        let array_offsets = vec![0, 2, 2, 4, 8, 11];
-        let array_mask = vec![true, false, true, true, true];
+        let list_field = Field::new("list", list_type.clone(), true);
+        let levels = calculate_array_levels(&list, &list_field).unwrap();
+        assert_eq!(levels.len(), 1);
 
-        let levels = parent_levels.calculate_child_levels(
-            array_offsets.clone(),
-            array_mask,
-            LevelType::List(true),
-        );
-        let expected_levels = LevelInfo {
-            // 0 1 [2] are 0 (not defined at level 1)
-            // [2] is 1, but has 0 slots so is not populated (defined at level 1 only)
-            // 2 3 [4] are 0
-            // 4 5 6 7 [8] are 1 (defined at level 1 only)
-            // 8 9 10 [11] are 2 (defined at both levels)
-            definition: vec![0, 0, 1, 0, 0, 3, 3, 3, 3, 3, 3, 3],

Review Comment:
   What I was trying to express here was the list's values even though the struct is null at that point. Hence you see `d: [0, 0], r: [0, 1]` to say that the value has 2 slots, which are all null due to the struct being null at that point.
   
   Perhaps my oversight was not considering that this might not matter at all, and it's safer/better to do `d: [0], r: [0]`.
   What limited me a lot was my indexing logic, because given an child array of 11 values, producing fewer than 11 repetition levels would lose the offset information from the Arrow side.
   
   Looking at your solution, you have 10 values vs my 12 (11 values and an extra value to encode the offset `[2, 2]` in the list). When slicing into the child list doesn't cause issues, your solution is fine/better.
   
   I suppose the root problem is Arrow's logical null rules. You could have a 1GB record with all sorts of values, but if you wrap that against a null struct (e.g. `struct<all the values>`), then you get different behaviour:
   
   * IPC roundtrip would preserve the values
   * Writing to and reading from Parquet would drop all those null values.
   
   I was trying to preserve data, but maybe while looking at the minor details, I ended up missing the bigger picture.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscribe@arrow.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org