You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by go...@apache.org on 2021/08/27 22:37:01 UTC

[iceberg] branch master updated: [Python] Support schema field ID validation in python (#2866)

This is an automated email from the ASF dual-hosted git repository.

gooch pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 7d6f692  [Python] Support schema field ID validation in python (#2866)
7d6f692 is described below

commit 7d6f692937a939ffccb8fa997a91bd49f616eab6
Author: jun-he <ju...@users.noreply.github.com>
AuthorDate: Fri Aug 27 15:36:50 2021 -0700

    [Python] Support schema field ID validation in python (#2866)
    
    * Support schema field ID validation in python. Also fix bugs in type_util and add unit tests.
    
    * Add license to the test file
    
    * update unit tests
    
    * address the comments
    
    * only ignore NotImplementedError error during visit.
---
 python/iceberg/api/schema.py             |  8 +++---
 python/iceberg/api/types/type_util.py    | 15 +++++++---
 python/tests/api/types/test_type_util.py | 49 ++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/python/iceberg/api/schema.py b/python/iceberg/api/schema.py
index 0d70bf8..50d99e7 100644
--- a/python/iceberg/api/schema.py
+++ b/python/iceberg/api/schema.py
@@ -16,7 +16,7 @@
 # under the License.
 
 from .types import StructType
-
+from .types import type_util
 """
   The schema of a data table.
 
@@ -45,9 +45,9 @@ class Schema(object):
             self._id_to_alias = {v: k for k, v in self._alias_to_id.items()}
 
         self._id_to_field = None
-        self._name_to_id = None
-        self._lowercase_name_to_id = None
-        self._id_to_name = None
+        self._name_to_id = type_util.index_by_name(self.struct)
+        self._id_to_name = {v: k for k, v in self._name_to_id.items()}
+        self._lowercase_name_to_id = {k.lower(): v for k, v in self._name_to_id.items()}
 
     def as_struct(self):
         return self.struct
diff --git a/python/iceberg/api/types/type_util.py b/python/iceberg/api/types/type_util.py
index f7f8b13..5a465e5 100644
--- a/python/iceberg/api/types/type_util.py
+++ b/python/iceberg/api/types/type_util.py
@@ -24,6 +24,7 @@ from .types import (ListType,
                     MapType,
                     NestedField,
                     StructType)
+from ...exceptions import ValidationException
 
 MAX_PRECISION = list()
 REQUIRED_LENGTH = [-1 for item in range(40)]
@@ -121,7 +122,8 @@ def visit(arg, visitor): # noqa: ignore=C901
                 result = None
                 try:
                     result = visit(field.type, visitor)
-                except RuntimeError:
+                except NotImplementedError:
+                    # will remove it after missing functions are implemented.
                     pass
                 finally:
                     visitor.field_ids.pop()
@@ -133,7 +135,8 @@ def visit(arg, visitor): # noqa: ignore=C901
             visitor.field_ids.append(list_var.element_id)
             try:
                 element_result = visit(list_var.element_type, visitor)
-            except RuntimeError:
+            except NotImplementedError:
+                # will remove it after missing functions are implemented.
                 pass
             finally:
                 visitor.field_ids.pop()
@@ -352,8 +355,12 @@ class IndexByName(SchemaVisitor):
 
     def add_field(self, name, field_id):
         full_name = name
-        if not self.field_names and len(self.field_names) > 0:
-            full_name = IndexByName.DOT.join([IndexByName.DOT.join(reversed(self.field_names)), name])
+        if self.field_names is not None and len(self.field_names) > 0:
+            full_name = IndexByName.DOT.join([IndexByName.DOT.join(self.field_names), name])
+
+        existing_field_id = self.name_to_id.get(full_name)
+        ValidationException.check(existing_field_id is None, "Invalid schema: multiple fields for name %s: %s and %s",
+                                  (full_name, existing_field_id, field_id))
 
         self.name_to_id[full_name] = field_id
 
diff --git a/python/tests/api/types/test_type_util.py b/python/tests/api/types/test_type_util.py
new file mode 100644
index 0000000..5803603
--- /dev/null
+++ b/python/tests/api/types/test_type_util.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from iceberg.api.types import (BooleanType, NestedField, StructType)
+from iceberg.api.types import type_util
+from iceberg.exceptions import ValidationException
+import pytest
+
+
+def test_invalid_schema_via_index_by_name():
+    bool_type1 = NestedField.required(1, "a", BooleanType.get())
+    bool_type2 = NestedField.required(2, "a", BooleanType.get())
+
+    with pytest.raises(ValidationException) as context:
+        type_util.index_by_name(StructType.of([bool_type1, bool_type2]))
+    assert str(context.value) == 'Invalid schema: multiple fields for name a: 1 and 2'
+
+
+def test_valid_schema_via_index_by_name():
+    bool_type1 = NestedField.required(1, "a", BooleanType.get())
+    bool_type2 = NestedField.required(2, "b", BooleanType.get())
+
+    assert {'a': 1, 'b': 2} == type_util.index_by_name(StructType.of([bool_type1, bool_type2]))
+
+
+def test_validate_schema_via_index_by_name_for_nested_type():
+    nested_type = NestedField.required(
+        1, "a", StructType.of(
+            [NestedField.required(2, "b", StructType.of(
+                [NestedField.required(3, "c", BooleanType.get())])),
+             NestedField.required(4, "b.c", BooleanType.get())]))
+
+    with pytest.raises(ValidationException) as context:
+        type_util.index_by_name(StructType.of([nested_type]))
+    assert str(context.value) == 'Invalid schema: multiple fields for name a.b.c: 3 and 4'