You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by go...@apache.org on 2021/08/27 22:37:01 UTC
[iceberg] branch master updated: [Python] Support schema field ID
validation in python (#2866)
This is an automated email from the ASF dual-hosted git repository.
gooch pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 7d6f692 [Python] Support schema field ID validation in python (#2866)
7d6f692 is described below
commit 7d6f692937a939ffccb8fa997a91bd49f616eab6
Author: jun-he <ju...@users.noreply.github.com>
AuthorDate: Fri Aug 27 15:36:50 2021 -0700
[Python] Support schema field ID validation in python (#2866)
* Support schema field ID validation in python. Also fix bugs in type_util and add unit tests.
* Add license to the test file
* update unit tests
* address the comments
* only ignore NotImplementedError error during visit.
---
python/iceberg/api/schema.py | 8 +++---
python/iceberg/api/types/type_util.py | 15 +++++++---
python/tests/api/types/test_type_util.py | 49 ++++++++++++++++++++++++++++++++
3 files changed, 64 insertions(+), 8 deletions(-)
diff --git a/python/iceberg/api/schema.py b/python/iceberg/api/schema.py
index 0d70bf8..50d99e7 100644
--- a/python/iceberg/api/schema.py
+++ b/python/iceberg/api/schema.py
@@ -16,7 +16,7 @@
# under the License.
from .types import StructType
-
+from .types import type_util
"""
The schema of a data table.
@@ -45,9 +45,9 @@ class Schema(object):
self._id_to_alias = {v: k for k, v in self._alias_to_id.items()}
self._id_to_field = None
- self._name_to_id = None
- self._lowercase_name_to_id = None
- self._id_to_name = None
+ self._name_to_id = type_util.index_by_name(self.struct)
+ self._id_to_name = {v: k for k, v in self._name_to_id.items()}
+ self._lowercase_name_to_id = {k.lower(): v for k, v in self._name_to_id.items()}
def as_struct(self):
return self.struct
diff --git a/python/iceberg/api/types/type_util.py b/python/iceberg/api/types/type_util.py
index f7f8b13..5a465e5 100644
--- a/python/iceberg/api/types/type_util.py
+++ b/python/iceberg/api/types/type_util.py
@@ -24,6 +24,7 @@ from .types import (ListType,
MapType,
NestedField,
StructType)
+from ...exceptions import ValidationException
MAX_PRECISION = list()
REQUIRED_LENGTH = [-1 for item in range(40)]
@@ -121,7 +122,8 @@ def visit(arg, visitor): # noqa: ignore=C901
result = None
try:
result = visit(field.type, visitor)
- except RuntimeError:
+ except NotImplementedError:
+ # will remove it after missing functions are implemented.
pass
finally:
visitor.field_ids.pop()
@@ -133,7 +135,8 @@ def visit(arg, visitor): # noqa: ignore=C901
visitor.field_ids.append(list_var.element_id)
try:
element_result = visit(list_var.element_type, visitor)
- except RuntimeError:
+ except NotImplementedError:
+ # will remove it after missing functions are implemented.
pass
finally:
visitor.field_ids.pop()
@@ -352,8 +355,12 @@ class IndexByName(SchemaVisitor):
def add_field(self, name, field_id):
full_name = name
- if not self.field_names and len(self.field_names) > 0:
- full_name = IndexByName.DOT.join([IndexByName.DOT.join(reversed(self.field_names)), name])
+ if self.field_names is not None and len(self.field_names) > 0:
+ full_name = IndexByName.DOT.join([IndexByName.DOT.join(self.field_names), name])
+
+ existing_field_id = self.name_to_id.get(full_name)
+ ValidationException.check(existing_field_id is None, "Invalid schema: multiple fields for name %s: %s and %s",
+ (full_name, existing_field_id, field_id))
self.name_to_id[full_name] = field_id
diff --git a/python/tests/api/types/test_type_util.py b/python/tests/api/types/test_type_util.py
new file mode 100644
index 0000000..5803603
--- /dev/null
+++ b/python/tests/api/types/test_type_util.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from iceberg.api.types import (BooleanType, NestedField, StructType)
+from iceberg.api.types import type_util
+from iceberg.exceptions import ValidationException
+import pytest
+
+
+def test_invalid_schema_via_index_by_name():
+ bool_type1 = NestedField.required(1, "a", BooleanType.get())
+ bool_type2 = NestedField.required(2, "a", BooleanType.get())
+
+ with pytest.raises(ValidationException) as context:
+ type_util.index_by_name(StructType.of([bool_type1, bool_type2]))
+ assert str(context.value) == 'Invalid schema: multiple fields for name a: 1 and 2'
+
+
+def test_valid_schema_via_index_by_name():
+ bool_type1 = NestedField.required(1, "a", BooleanType.get())
+ bool_type2 = NestedField.required(2, "b", BooleanType.get())
+
+ assert {'a': 1, 'b': 2} == type_util.index_by_name(StructType.of([bool_type1, bool_type2]))
+
+
+def test_validate_schema_via_index_by_name_for_nested_type():
+ nested_type = NestedField.required(
+ 1, "a", StructType.of(
+ [NestedField.required(2, "b", StructType.of(
+ [NestedField.required(3, "c", BooleanType.get())])),
+ NestedField.required(4, "b.c", BooleanType.get())]))
+
+ with pytest.raises(ValidationException) as context:
+ type_util.index_by_name(StructType.of([nested_type]))
+ assert str(context.value) == 'Invalid schema: multiple fields for name a.b.c: 3 and 4'