You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by fo...@apache.org on 2022/09/20 18:07:43 UTC
[iceberg] branch master updated: Python: Handle optional Avro fields in conversion. (#5796)

This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 53e056a277 Python: Handle optional Avro fields in conversion. (#5796)
53e056a277 is described below

commit 53e056a2771fd8aa6b88d6e21e0586c1b6a92ac8
Author: Joshua Robinson <33...@users.noreply.github.com>
AuthorDate: Tue Sep 20 20:07:36 2022 +0200

    Python: Handle optional Avro fields in conversion. (#5796)
    
    Found while processing a manifest entry whose split_offsets field is empty.
    
    In _convert_pos_to_dict, check whether values is None before processing
    it as a list, struct, or map. Added unit tests to verify.
    
    Thanks to @fokko for the fix.
---
 python/pyiceberg/manifest.py     | 20 +++++++++++------
 python/tests/avro/test_reader.py | 46 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 6 deletions(-)

diff --git a/python/pyiceberg/manifest.py b/python/pyiceberg/manifest.py
index 751dfbef2a..8619d18abf 100644
--- a/python/pyiceberg/manifest.py
+++ b/python/pyiceberg/manifest.py
@@ -173,22 +173,30 @@ def _(schema: Schema, struct: AvroStruct) -> Dict[str, Any]:
 @_convert_pos_to_dict.register
 def _(struct_type: StructType, values: AvroStruct) -> Dict[str, Any]:
     """Iterates over all the fields in the dict, and gets the data from the struct"""
-    return {field.name: _convert_pos_to_dict(field.field_type, values.get(pos)) for pos, field in enumerate(struct_type.fields)}
+    return (
+        {field.name: _convert_pos_to_dict(field.field_type, values.get(pos)) for pos, field in enumerate(struct_type.fields)}
+        if values is not None
+        else None
+    )
 
 
 @_convert_pos_to_dict.register
 def _(list_type: ListType, values: List[Any]) -> Any:
     """In the case of a list, we'll go over the elements in the list to handle complex types"""
-    return [_convert_pos_to_dict(list_type.element_type, value) for value in values]
+    return [_convert_pos_to_dict(list_type.element_type, value) for value in values] if values is not None else None
 
 
 @_convert_pos_to_dict.register
 def _(map_type: MapType, values: Dict) -> Dict:
     """In the case of a map, we both traverse over the key and value to handle complex types"""
-    return {
-        _convert_pos_to_dict(map_type.key_type, key): _convert_pos_to_dict(map_type.value_type, value)
-        for key, value in values.items()
-    }
+    return (
+        {
+            _convert_pos_to_dict(map_type.key_type, key): _convert_pos_to_dict(map_type.value_type, value)
+            for key, value in values.items()
+        }
+        if values is not None
+        else None
+    )
 
 
 @_convert_pos_to_dict.register
diff --git a/python/tests/avro/test_reader.py b/python/tests/avro/test_reader.py
index 44505a6174..77e186b4f7 100644
--- a/python/tests/avro/test_reader.py
+++ b/python/tests/avro/test_reader.py
@@ -36,6 +36,7 @@ from pyiceberg.avro.reader import (
     TimestamptzReader,
     primitive_reader,
 )
+from pyiceberg.manifest import _convert_pos_to_dict
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
     BinaryType,
@@ -46,9 +47,13 @@ from pyiceberg.types import (
     FixedType,
     FloatType,
     IntegerType,
+    ListType,
     LongType,
+    MapType,
+    NestedField,
     PrimitiveType,
     StringType,
+    StructType,
     TimestampType,
     TimestamptzType,
     TimeType,
@@ -395,6 +400,47 @@ def test_read_manifest_file_file(generated_manifest_file_file: str):
     assert actual == expected
 
 
+def test_null_list_convert_pos_to_dict():
+    data = _convert_pos_to_dict(
+        Schema(
+            NestedField(name="field", field_id=1, field_type=ListType(element_id=2, element=StringType(), element_required=False))
+        ),
+        AvroStruct([None]),
+    )
+    assert data["field"] is None
+
+
+def test_null_dict_convert_pos_to_dict():
+    data = _convert_pos_to_dict(
+        Schema(
+            NestedField(
+                name="field",
+                field_id=1,
+                field_type=MapType(key_id=2, key_type=StringType(), value_id=3, value_type=StringType(), value_required=False),
+            )
+        ),
+        AvroStruct([None]),
+    )
+    assert data["field"] is None
+
+
+def test_null_struct_convert_pos_to_dict():
+    data = _convert_pos_to_dict(
+        Schema(
+            NestedField(
+                name="field",
+                field_id=1,
+                field_type=StructType(
+                    NestedField(2, "required_field", StringType(), True), NestedField(3, "optional_field", IntegerType())
+                ),
+                required=False,
+            )
+        ),
+        AvroStruct([None]),
+    )
+    assert data["field"] is None
+
+
 def test_fixed_reader():
     assert primitive_reader(FixedType(22)) == FixedReader(22)