You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2022/11/03 17:39:18 UTC
[iceberg] branch master updated: Python: Use Types from Typing (#6114)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new ddf0e27c6b Python: Use Types from Typing (#6114)
ddf0e27c6b is described below
commit ddf0e27c6b5304f3a7af7b5952ea36a3bcce4318
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Thu Nov 3 18:39:12 2022 +0100
Python: Use Types from Typing (#6114)
---
python/.pre-commit-config.yaml | 2 +-
python/CONTRIBUTING.md | 6 ++++
python/pyiceberg/avro/codecs/__init__.py | 4 ++-
python/pyiceberg/avro/file.py | 9 +++---
python/pyiceberg/avro/reader.py | 19 +++++++----
python/pyiceberg/catalog/__init__.py | 49 ++++++++++++++++-------------
python/pyiceberg/expressions/literals.py | 29 ++++++++++-------
python/pyiceberg/utils/schema_conversion.py | 14 ++++++---
8 files changed, 82 insertions(+), 50 deletions(-)
diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml
index 62b1881950..91452557d1 100644
--- a/python/.pre-commit-config.yaml
+++ b/python/.pre-commit-config.yaml
@@ -51,7 +51,7 @@ repos:
rev: v3.0.0
hooks:
- id: pyupgrade
- args: [--py38-plus]
+ args: [ --py38-plus, --keep-runtime-typing ]
- repo: https://github.com/pycqa/pylint
rev: v2.15.3
hooks:
diff --git a/python/CONTRIBUTING.md b/python/CONTRIBUTING.md
index d673b43920..ee2d23a82b 100644
--- a/python/CONTRIBUTING.md
+++ b/python/CONTRIBUTING.md
@@ -124,6 +124,12 @@ Which will warn:
Call to load_something, deprecated in 0.1.0, will be removed in 0.2.0. Please use load_something_else() instead.
```
+## Type annotations
+
For type annotations we currently rely on the `typing` module that comes with Python.
+
Since we support Python 3.8 onwards, we can't use the [type hints from the standard collections](https://peps.python.org/pep-0585/).
+
## Third party libraries
Since we expect PyIceberg to be integrated into the Python ecosystem, we want to be hesitant with the use of third party packages. Adding a lot of packages makes the library heavyweight, and causes incompatibilities with other projects if they use a different version of the library. Also, big libraries such as `s3fs`, `pyarrow`, `thrift` should be optional to avoid downloading everything, while not being sure if it is actually being used.
diff --git a/python/pyiceberg/avro/codecs/__init__.py b/python/pyiceberg/avro/codecs/__init__.py
index 0512766280..8010b9397e 100644
--- a/python/pyiceberg/avro/codecs/__init__.py
+++ b/python/pyiceberg/avro/codecs/__init__.py
@@ -25,13 +25,15 @@ converting character sets (https://docs.python.org/3/library/codecs.html).
"""
from __future__ import annotations
+from typing import Dict, Optional, Type
+
from pyiceberg.avro.codecs.bzip2 import BZip2Codec
from pyiceberg.avro.codecs.codec import Codec
from pyiceberg.avro.codecs.deflate import DeflateCodec
from pyiceberg.avro.codecs.snappy_codec import SnappyCodec
from pyiceberg.avro.codecs.zstandard_codec import ZStandardCodec
-KNOWN_CODECS: dict[str, type[Codec] | None] = {
+KNOWN_CODECS: Dict[str, Optional[Type[Codec]]] = {
"null": None,
"bzip2": BZip2Codec,
"snappy": SnappyCodec,
diff --git a/python/pyiceberg/avro/file.py b/python/pyiceberg/avro/file.py
index cce705382f..f17056640f 100644
--- a/python/pyiceberg/avro/file.py
+++ b/python/pyiceberg/avro/file.py
@@ -23,6 +23,7 @@ from __future__ import annotations
import json
from dataclasses import dataclass
from io import SEEK_SET
+from typing import Optional, Type
from pyiceberg.avro.codecs import KNOWN_CODECS, Codec
from pyiceberg.avro.decoder import BinaryDecoder
@@ -65,7 +66,7 @@ class AvroFileHeader:
meta: dict[str, str]
sync: bytes
- def compression_codec(self) -> type[Codec] | None:
+ def compression_codec(self) -> Optional[Type[Codec]]:
"""Get the file's compression codec algorithm from the file's metadata.
In the case of a null codec, we return a None indicating that we
@@ -108,7 +109,7 @@ class Block:
class AvroFile:
input_file: InputFile
- read_schema: Schema | None
+ read_schema: Optional[Schema]
input_stream: InputStream
header: AvroFileHeader
schema: Schema
@@ -116,9 +117,9 @@ class AvroFile:
reader: StructReader
decoder: BinaryDecoder
- block: Block | None = None
+ block: Optional[Block] = None
- def __init__(self, input_file: InputFile, read_schema: Schema | None = None) -> None:
+ def __init__(self, input_file: InputFile, read_schema: Optional[Schema] = None) -> None:
self.input_file = input_file
self.read_schema = read_schema
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index 3de1105575..6b93b67937 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -31,7 +31,14 @@ from dataclasses import field as dataclassfield
from datetime import date, datetime, time
from decimal import Decimal
from functools import singledispatch
-from typing import Any, Callable
+from typing import (
+ Any,
+ Callable,
+ List,
+ Optional,
+ Tuple,
+ Union,
+)
from uuid import UUID
from pyiceberg.avro.decoder import BinaryDecoder
@@ -98,7 +105,7 @@ def _skip_map_array(decoder: BinaryDecoder, skip_entry: Callable) -> None:
@dataclass(frozen=True)
class AvroStruct(StructProtocol):
- _data: list[Any | StructProtocol] = dataclassfield()
+ _data: List[Union[Any, StructProtocol]] = dataclassfield()
def set(self, pos: int, value: Any) -> None:
self._data[pos] = value
@@ -242,7 +249,7 @@ class DecimalReader(Reader):
class OptionReader(Reader):
option: Reader = dataclassfield()
- def read(self, decoder: BinaryDecoder) -> Any | None:
+ def read(self, decoder: BinaryDecoder) -> Optional[Any]:
# For the Iceberg spec it is required to set the default value to null
# From https://iceberg.apache.org/spec/#avro
# Optional fields must always set the Avro field default value to null.
@@ -263,10 +270,10 @@ class OptionReader(Reader):
@dataclass(frozen=True)
class StructReader(Reader):
- fields: tuple[tuple[int | None, Reader], ...] = dataclassfield()
+ fields: Tuple[Tuple[Optional[int], Reader], ...] = dataclassfield()
def read(self, decoder: BinaryDecoder) -> AvroStruct:
- result: list[Any | StructProtocol] = [None] * len(self.fields)
+ result: List[Union[Any, StructProtocol]] = [None] * len(self.fields)
for (pos, field) in self.fields:
if pos is not None:
result[pos] = field.read(decoder)
@@ -332,7 +339,7 @@ class ConstructReader(SchemaVisitor[Reader]):
def schema(self, schema: Schema, struct_result: Reader) -> Reader:
return struct_result
- def struct(self, struct: StructType, field_results: list[Reader]) -> Reader:
+ def struct(self, struct: StructType, field_results: List[Reader]) -> Reader:
return StructReader(tuple(enumerate(field_results)))
def field(self, field: NestedField, field_result: Reader) -> Reader:
diff --git a/python/pyiceberg/catalog/__init__.py b/python/pyiceberg/catalog/__init__.py
index d4e3163c29..bb5cf417bf 100644
--- a/python/pyiceberg/catalog/__init__.py
+++ b/python/pyiceberg/catalog/__init__.py
@@ -21,7 +21,12 @@ import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from enum import Enum
-from typing import Callable
+from typing import (
+ Callable,
+ List,
+ Optional,
+ Union,
+)
from pyiceberg.exceptions import NotInstalledError
from pyiceberg.schema import Schema
@@ -70,7 +75,7 @@ AVAILABLE_CATALOGS: dict[CatalogType, Callable[[str, Properties], Catalog]] = {
}
-def infer_catalog_type(name: str, catalog_properties: RecursiveDict) -> CatalogType | None:
+def infer_catalog_type(name: str, catalog_properties: RecursiveDict) -> Optional[CatalogType]:
"""Tries to infer the type based on the dict
Args:
@@ -97,7 +102,7 @@ def infer_catalog_type(name: str, catalog_properties: RecursiveDict) -> CatalogT
)
-def load_catalog(name: str, **properties: str | None) -> Catalog:
+def load_catalog(name: str, **properties: Optional[str]) -> Catalog:
"""Load the catalog based on the properties
Will look up the properties from the config, based on the name
@@ -116,7 +121,7 @@ def load_catalog(name: str, **properties: str | None) -> Catalog:
env = _ENV_CONFIG.get_catalog_config(name)
conf = merge_config(env or {}, properties)
- catalog_type: CatalogType | None
+ catalog_type: Optional[CatalogType]
if provided_catalog_type := conf.get(TYPE):
catalog_type = CatalogType[provided_catalog_type.upper()]
else:
@@ -130,9 +135,9 @@ def load_catalog(name: str, **properties: str | None) -> Catalog:
@dataclass
class PropertiesUpdateSummary:
- removed: list[str]
- updated: list[str]
- missing: list[str]
+ removed: List[str]
+ updated: List[str]
+ missing: List[str]
class Catalog(ABC):
@@ -159,9 +164,9 @@ class Catalog(ABC):
@abstractmethod
def create_table(
self,
- identifier: str | Identifier,
+ identifier: Union[str, Identifier],
schema: Schema,
- location: str | None = None,
+ location: Optional[str] = None,
partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
sort_order: SortOrder = UNSORTED_SORT_ORDER,
properties: Properties = EMPTY_DICT,
@@ -184,7 +189,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def load_table(self, identifier: str | Identifier) -> Table:
+ def load_table(self, identifier: Union[str, Identifier]) -> Table:
"""Loads the table's metadata and returns the table instance.
You can also use this method to check for table existence using 'try catalog.table() except NoSuchTableError'
@@ -201,7 +206,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def drop_table(self, identifier: str | Identifier) -> None:
+ def drop_table(self, identifier: Union[str, Identifier]) -> None:
"""Drop a table.
Args:
@@ -212,7 +217,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def purge_table(self, identifier: str | Identifier) -> None:
+ def purge_table(self, identifier: Union[str, Identifier]) -> None:
"""Drop a table and purge all data and metadata files.
Args:
@@ -223,7 +228,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def rename_table(self, from_identifier: str | Identifier, to_identifier: str | Identifier) -> Table:
+ def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table:
"""Rename a fully classified table name
Args:
@@ -238,7 +243,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def create_namespace(self, namespace: str | Identifier, properties: Properties = EMPTY_DICT) -> None:
+ def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None:
"""Create a namespace in the catalog.
Args:
@@ -250,7 +255,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def drop_namespace(self, namespace: str | Identifier) -> None:
+ def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
"""Drop a namespace.
Args:
@@ -262,7 +267,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def list_tables(self, namespace: str | Identifier) -> list[Identifier]:
+ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
"""List tables under the given namespace in the catalog.
If namespace not provided, will list all tables in the catalog.
@@ -278,7 +283,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]:
+ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]:
"""List namespaces from the given namespace. If not given, list top-level namespaces from the catalog.
Args:
@@ -292,7 +297,7 @@ class Catalog(ABC):
"""
@abstractmethod
- def load_namespace_properties(self, namespace: str | Identifier) -> Properties:
+ def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties:
"""Get properties for a namespace.
Args:
@@ -307,7 +312,7 @@ class Catalog(ABC):
@abstractmethod
def update_namespace_properties(
- self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT
+ self, namespace: Union[str, Identifier], removals: set[str] | None = None, updates: Properties = EMPTY_DICT
) -> PropertiesUpdateSummary:
"""Removes provided property keys and updates properties for a namespace.
@@ -322,7 +327,7 @@ class Catalog(ABC):
"""
@staticmethod
- def identifier_to_tuple(identifier: str | Identifier) -> Identifier:
+ def identifier_to_tuple(identifier: Union[str, Identifier]) -> Identifier:
"""Parses an identifier to a tuple.
If the identifier is a string, it is split into a tuple on '.'. If it is a tuple, it is used as-is.
@@ -336,7 +341,7 @@ class Catalog(ABC):
return identifier if isinstance(identifier, tuple) else tuple(str.split(identifier, "."))
@staticmethod
- def table_name_from(identifier: str | Identifier) -> str:
+ def table_name_from(identifier: Union[str, Identifier]) -> str:
"""Extracts table name from a table identifier
Args:
@@ -348,7 +353,7 @@ class Catalog(ABC):
return Catalog.identifier_to_tuple(identifier)[-1]
@staticmethod
- def namespace_from(identifier: str | Identifier) -> Identifier:
+ def namespace_from(identifier: Union[str, Identifier]) -> Identifier:
"""Extracts table namespace from a table identifier
Args:
diff --git a/python/pyiceberg/expressions/literals.py b/python/pyiceberg/expressions/literals.py
index f49ee61805..458cbda41d 100644
--- a/python/pyiceberg/expressions/literals.py
+++ b/python/pyiceberg/expressions/literals.py
@@ -25,7 +25,12 @@ import struct
from abc import ABC, abstractmethod
from decimal import ROUND_HALF_UP, Decimal
from functools import singledispatch, singledispatchmethod
-from typing import Generic, TypeVar
+from typing import (
+ Generic,
+ Optional,
+ TypeVar,
+ Union,
+)
from uuid import UUID
from pyiceberg.types import (
@@ -217,7 +222,7 @@ class LongLiteral(Literal[int]):
return self
@to.register(IntegerType)
- def _(self, type_var: IntegerType) -> AboveMax | BelowMin | Literal[int]:
+ def _(self, _: IntegerType) -> Union[AboveMax, BelowMin, Literal[int]]:
if IntegerType.max < self.value:
return AboveMax()
elif IntegerType.min > self.value:
@@ -305,7 +310,7 @@ class DoubleLiteral(Literal[float]):
return self
@to.register(FloatType)
- def _(self, type_var: FloatType) -> AboveMax | BelowMin | Literal[float]:
+ def _(self, _: FloatType) -> Union[AboveMax, BelowMin, Literal[float]]:
if FloatType.max < self.value:
return AboveMax()
elif FloatType.min > self.value:
@@ -369,7 +374,7 @@ class DecimalLiteral(Literal[Decimal]):
return None
@to.register(DecimalType)
- def _(self, type_var: DecimalType) -> Literal[Decimal] | None:
+ def _(self, type_var: DecimalType) -> Optional[Literal[Decimal]]:
if type_var.scale == abs(self.value.as_tuple().exponent):
return self
return None
@@ -388,28 +393,28 @@ class StringLiteral(Literal[str]):
return self
@to.register(DateType)
- def _(self, type_var: DateType) -> Literal[int] | None:
+ def _(self, type_var: DateType) -> Optional[Literal[int]]:
try:
return DateLiteral(date_to_days(self.value))
except (TypeError, ValueError):
return None
@to.register(TimeType)
- def _(self, type_var: TimeType) -> Literal[int] | None:
+ def _(self, type_var: TimeType) -> Optional[Literal[int]]:
try:
return TimeLiteral(time_to_micros(self.value))
except (TypeError, ValueError):
return None
@to.register(TimestampType)
- def _(self, type_var: TimestampType) -> Literal[int] | None:
+ def _(self, type_var: TimestampType) -> Optional[Literal[int]]:
try:
return TimestampLiteral(timestamp_to_micros(self.value))
except (TypeError, ValueError):
return None
@to.register(TimestamptzType)
- def _(self, type_var: TimestamptzType) -> Literal[int] | None:
+ def _(self, type_var: TimestamptzType) -> Optional[Literal[int]]:
try:
return TimestampLiteral(timestamptz_to_micros(self.value))
except (TypeError, ValueError):
@@ -420,7 +425,7 @@ class StringLiteral(Literal[str]):
return UUIDLiteral(UUID(self.value))
@to.register(DecimalType)
- def _(self, type_var: DecimalType) -> Literal[Decimal] | None:
+ def _(self, type_var: DecimalType) -> Optional[Literal[Decimal]]:
dec = Decimal(self.value)
if type_var.scale == abs(dec.as_tuple().exponent):
return DecimalLiteral(dec)
@@ -450,7 +455,7 @@ class FixedLiteral(Literal[bytes]):
return None
@to.register(FixedType)
- def _(self, type_var: FixedType) -> Literal[bytes] | None:
+ def _(self, type_var: FixedType) -> Optional[Literal[bytes]]:
if len(self.value) == type_var.length:
return self
else:
@@ -470,11 +475,11 @@ class BinaryLiteral(Literal[bytes]):
return None
@to.register(BinaryType)
- def _(self, type_var: BinaryType) -> Literal[bytes]:
+ def _(self, _: BinaryType) -> Literal[bytes]:
return self
@to.register(FixedType)
- def _(self, type_var: FixedType) -> Literal[bytes] | None:
+ def _(self, type_var: FixedType) -> Optional[Literal[bytes]]:
if type_var.length == len(self.value):
return FixedLiteral(self.value)
else:
diff --git a/python/pyiceberg/utils/schema_conversion.py b/python/pyiceberg/utils/schema_conversion.py
index d6c32e0121..e3a9a437d1 100644
--- a/python/pyiceberg/utils/schema_conversion.py
+++ b/python/pyiceberg/utils/schema_conversion.py
@@ -18,7 +18,13 @@
from __future__ import annotations
import logging
-from typing import Any
+from typing import (
+ Any,
+ Dict,
+ List,
+ Tuple,
+ Union,
+)
from pyiceberg.schema import Schema
from pyiceberg.types import (
@@ -112,7 +118,7 @@ class AvroSchemaConversion:
"""
return Schema(*[self._convert_field(field) for field in avro_schema["fields"]], schema_id=1)
- def _resolve_union(self, type_union: dict | list | str) -> tuple[str | dict[str, Any], bool]:
+ def _resolve_union(self, type_union: Union[Dict, List, str]) -> Tuple[Union[str, Dict[str, Any]], bool]:
"""
Converts Unions into their type and resolves if the field is required
@@ -135,7 +141,7 @@ class AvroSchemaConversion:
Raises:
TypeError: In the case non-optional union types are encountered
"""
- avro_types: dict | list
+ avro_types: Union[Dict, List]
if isinstance(type_union, str):
# It is a primitive and required
return type_union, True
@@ -161,7 +167,7 @@ class AvroSchemaConversion:
# Filter the null value and return the type
return list(filter(lambda t: t != "null", avro_types))[0], False
- def _convert_schema(self, avro_type: str | dict[str, Any]) -> IcebergType:
+ def _convert_schema(self, avro_type: Union[str, Dict[str, Any]]) -> IcebergType:
"""
Resolves the Avro type