You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by fo...@apache.org on 2022/09/19 06:39:27 UTC
[iceberg] branch master updated: Python: Remove the pre-validators (#5686)
This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 29772892fb Python: Remove the pre-validators (#5686)
29772892fb is described below
commit 29772892fbe38d3b6664492daebfc2a51b4b6f5f
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Mon Sep 19 08:39:22 2022 +0200
Python: Remove the pre-validators (#5686)
I would like to remove the pre-validators because they are confusing.
Mostly because in the pre-validators the defaults and aliases aren't
applied, do we have to check all the cases. Removing those requires
setting defaults, and checking them afterward.
---
python/pyiceberg/table/metadata.py | 57 +++++++++++++++++++++-----------------
1 file changed, 32 insertions(+), 25 deletions(-)
diff --git a/python/pyiceberg/table/metadata.py b/python/pyiceberg/table/metadata.py
index b0818fc1e8..d3d1fbbfa5 100644
--- a/python/pyiceberg/table/metadata.py
+++ b/python/pyiceberg/table/metadata.py
@@ -43,17 +43,23 @@ from pyiceberg.typedef import EMPTY_DICT, Properties
from pyiceberg.utils.datetime import datetime_to_micros
from pyiceberg.utils.iceberg_base_model import IcebergBaseModel
+CURRENT_SNAPSHOT_ID = "current_snapshot_id"
+CURRENT_SCHEMA_ID = "current_schema_id"
+SCHEMAS = "schemas"
+PARTITION_SPECS = "partition_specs"
+SORT_ORDERS = "sort_orders"
+REFS = "refs"
+
INITIAL_SEQUENCE_NUMBER = 0
INITIAL_SPEC_ID = 0
DEFAULT_SCHEMA_ID = 0
-DEFAULT_LAST_PARTITION_ID = 1000
def check_schemas(values: Dict[str, Any]) -> Dict[str, Any]:
"""Validator to check if the current-schema-id is actually present in schemas"""
- current_schema_id = values["current_schema_id"]
+ current_schema_id = values[CURRENT_SCHEMA_ID]
- for schema in values["schemas"]:
+ for schema in values[SCHEMAS]:
if schema.schema_id == current_schema_id:
return values
@@ -64,7 +70,7 @@ def check_partition_specs(values: Dict[str, Any]) -> Dict[str, Any]:
"""Validator to check if the default-spec-id is present in partition-specs"""
default_spec_id = values["default_spec_id"]
- partition_specs: List[PartitionSpec] = values["partition_specs"]
+ partition_specs: List[PartitionSpec] = values[PARTITION_SPECS]
for spec in partition_specs:
if spec.spec_id == default_spec_id:
return values
@@ -77,7 +83,7 @@ def check_sort_orders(values: Dict[str, Any]) -> Dict[str, Any]:
default_sort_order_id: int = values["default_sort_order_id"]
if default_sort_order_id != UNSORTED_SORT_ORDER_ID:
- sort_orders: List[SortOrder] = values["sort_orders"]
+ sort_orders: List[SortOrder] = values[SORT_ORDERS]
for sort_order in sort_orders:
if sort_order.order_id == default_sort_order_id:
return values
@@ -90,20 +96,20 @@ class TableMetadataCommonFields(IcebergBaseModel):
"""Metadata for an Iceberg table as specified in the Apache Iceberg
spec (https://iceberg.apache.org/spec/#iceberg-table-spec)"""
- @root_validator(pre=True)
+ @root_validator(skip_on_failure=True)
def cleanup_snapshot_id(cls, data: Dict[str, Any]):
- if data.get("current-snapshot-id") == -1:
+ if data[CURRENT_SNAPSHOT_ID] == -1:
# We treat -1 and None the same, by cleaning this up
# in a pre-validator, we can simplify the logic later on
- data["current-snapshot-id"] = None
+ data[CURRENT_SNAPSHOT_ID] = None
return data
@root_validator(skip_on_failure=True)
def construct_refs(cls, data: Dict[str, Any]):
# This is going to be much nicer as soon as refs is an actual pydantic object
- if current_snapshot_id := data.get("current_snapshot_id"):
- if MAIN_BRANCH not in data["refs"]:
- data["refs"][MAIN_BRANCH] = SnapshotRef(snapshot_id=current_snapshot_id, snapshot_ref_type=SnapshotRefType.BRANCH)
+ if current_snapshot_id := data.get(CURRENT_SNAPSHOT_ID):
+ if MAIN_BRANCH not in data[REFS]:
+ data[REFS][MAIN_BRANCH] = SnapshotRef(snapshot_id=current_snapshot_id, snapshot_ref_type=SnapshotRefType.BRANCH)
return data
location: str = Field()
@@ -137,7 +143,7 @@ class TableMetadataCommonFields(IcebergBaseModel):
default_spec_id: int = Field(alias="default-spec-id", default=INITIAL_SPEC_ID)
"""ID of the “current” spec that writers should use by default."""
- last_partition_id: int = Field(alias="last-partition-id")
+ last_partition_id: Optional[int] = Field(alias="last-partition-id")
"""An integer; the highest assigned partition field ID across all
partition specs for the table. This is used to ensure partition fields
are always assigned an unused ID when evolving specs."""
@@ -203,24 +209,20 @@ class TableMetadataV1(TableMetadataCommonFields, IcebergBaseModel):
# because bumping the version should be an explicit operation that is up
# to the owner of the table.
- @root_validator(pre=True)
+ @root_validator
def set_v2_compatible_defaults(cls, data: Dict[str, Any]) -> Dict[str, Any]:
"""Sets default values to be compatible with the format v2
- Set some sensible defaults for V1, so we comply with the schema
- this is in pre=True, meaning that this will be done before validation.
- We don't want to make the fields optional, since they are required for V2
-
Args:
data: The raw arguments when initializing a V1 TableMetadata
Returns:
The TableMetadata with the defaults applied
"""
- if data.get("schema") and "schema-id" not in data["schema"]:
- data["schema"]["schema-id"] = DEFAULT_SCHEMA_ID
- if data.get("partition-spec") and "last-partition-id" not in data:
- data["last-partition-id"] = max(spec["field-id"] for spec in data["partition-spec"])
+ # When the schema doesn't have an ID
+ if data.get("schema") and "schema_id" not in data["schema"]:
+ data["schema"]["schema_id"] = DEFAULT_SCHEMA_ID
+
return data
@root_validator(skip_on_failure=True)
@@ -258,11 +260,16 @@ class TableMetadataV1(TableMetadataCommonFields, IcebergBaseModel):
Returns:
The TableMetadata with the partition_specs set, if not provided
"""
- if not data.get("partition_specs"):
+ if not data.get(PARTITION_SPECS):
fields = data["partition_spec"]
- data["partition_specs"] = [PartitionSpec(spec_id=INITIAL_SPEC_ID, fields=fields)]
+ data[PARTITION_SPECS] = [PartitionSpec(spec_id=INITIAL_SPEC_ID, fields=fields)]
else:
check_partition_specs(data)
+
+ if "last_partition_id" not in data or data.get("last_partition_id") is None:
+ if partition_specs := data.get(PARTITION_SPECS):
+ data["last_partition_id"] = max(spec.last_assigned_field_id for spec in partition_specs)
+
return data
@root_validator(skip_on_failure=True)
@@ -278,8 +285,8 @@ class TableMetadataV1(TableMetadataCommonFields, IcebergBaseModel):
Returns:
The TableMetadata with the sort_orders set, if not provided
"""
- if not data.get("sort_orders"):
- data["sort_orders"] = [UNSORTED_SORT_ORDER]
+ if not data.get(SORT_ORDERS):
+ data[SORT_ORDERS] = [UNSORTED_SORT_ORDER]
else:
check_sort_orders(data)
return data