You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2022/11/03 17:39:18 UTC

[iceberg] branch master updated: Python: Use Types from Typing (#6114)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new ddf0e27c6b Python: Use Types from Typing (#6114)
ddf0e27c6b is described below

commit ddf0e27c6b5304f3a7af7b5952ea36a3bcce4318
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Thu Nov 3 18:39:12 2022 +0100

    Python: Use Types from Typing (#6114)
---
 python/.pre-commit-config.yaml              |  2 +-
 python/CONTRIBUTING.md                      |  6 ++++
 python/pyiceberg/avro/codecs/__init__.py    |  4 ++-
 python/pyiceberg/avro/file.py               |  9 +++---
 python/pyiceberg/avro/reader.py             | 19 +++++++----
 python/pyiceberg/catalog/__init__.py        | 49 ++++++++++++++++-------------
 python/pyiceberg/expressions/literals.py    | 29 ++++++++++-------
 python/pyiceberg/utils/schema_conversion.py | 14 ++++++---
 8 files changed, 82 insertions(+), 50 deletions(-)

diff --git a/python/.pre-commit-config.yaml b/python/.pre-commit-config.yaml
index 62b1881950..91452557d1 100644
--- a/python/.pre-commit-config.yaml
+++ b/python/.pre-commit-config.yaml
@@ -51,7 +51,7 @@ repos:
     rev: v3.0.0
     hooks:
       - id: pyupgrade
-        args: [--py38-plus]
+        args: [ --py38-plus, --keep-runtime-typing ]
   - repo: https://github.com/pycqa/pylint
     rev: v2.15.3
     hooks:
diff --git a/python/CONTRIBUTING.md b/python/CONTRIBUTING.md
index d673b43920..ee2d23a82b 100644
--- a/python/CONTRIBUTING.md
+++ b/python/CONTRIBUTING.md
@@ -124,6 +124,12 @@ Which will warn:
 Call to load_something, deprecated in 0.1.0, will be removed in 0.2.0. Please use load_something_else() instead.
 ```
 
+## Type annotations
+
+For the type annotation we currently rely on the `Typing` package that comes with Python.
+
+Since we're supporting from Python 3.8 onwards, we can't use the [type hints from the standard collections](https://peps.python.org/pep-0585/).
+
 ## Third party libraries
 
 Since we expect PyIceberg to be integrated into the Python ecosystem, we want to be hesitant with the use of third party packages. Adding a lot of packages makes the library heavyweight, and causes incompatibilities with other projects if they use a different version of the library. Also, big libraries such as `s3fs`, `pyarrow`, `thrift` should be optional to avoid downloading everything, while not being sure if is actually being used.
diff --git a/python/pyiceberg/avro/codecs/__init__.py b/python/pyiceberg/avro/codecs/__init__.py
index 0512766280..8010b9397e 100644
--- a/python/pyiceberg/avro/codecs/__init__.py
+++ b/python/pyiceberg/avro/codecs/__init__.py
@@ -25,13 +25,15 @@ converting character sets (https://docs.python.org/3/library/codecs.html).
 """
 from __future__ import annotations
 
+from typing import Dict, Optional, Type
+
 from pyiceberg.avro.codecs.bzip2 import BZip2Codec
 from pyiceberg.avro.codecs.codec import Codec
 from pyiceberg.avro.codecs.deflate import DeflateCodec
 from pyiceberg.avro.codecs.snappy_codec import SnappyCodec
 from pyiceberg.avro.codecs.zstandard_codec import ZStandardCodec
 
-KNOWN_CODECS: dict[str, type[Codec] | None] = {
+KNOWN_CODECS: Dict[str, Optional[Type[Codec]]] = {
     "null": None,
     "bzip2": BZip2Codec,
     "snappy": SnappyCodec,
diff --git a/python/pyiceberg/avro/file.py b/python/pyiceberg/avro/file.py
index cce705382f..f17056640f 100644
--- a/python/pyiceberg/avro/file.py
+++ b/python/pyiceberg/avro/file.py
@@ -23,6 +23,7 @@ from __future__ import annotations
 import json
 from dataclasses import dataclass
 from io import SEEK_SET
+from typing import Optional, Type
 
 from pyiceberg.avro.codecs import KNOWN_CODECS, Codec
 from pyiceberg.avro.decoder import BinaryDecoder
@@ -65,7 +66,7 @@ class AvroFileHeader:
     meta: dict[str, str]
     sync: bytes
 
-    def compression_codec(self) -> type[Codec] | None:
+    def compression_codec(self) -> Optional[Type[Codec]]:
         """Get the file's compression codec algorithm from the file's metadata.
 
         In the case of a null codec, we return a None indicating that we
@@ -108,7 +109,7 @@ class Block:
 
 class AvroFile:
     input_file: InputFile
-    read_schema: Schema | None
+    read_schema: Optional[Schema]
     input_stream: InputStream
     header: AvroFileHeader
     schema: Schema
@@ -116,9 +117,9 @@ class AvroFile:
     reader: StructReader
 
     decoder: BinaryDecoder
-    block: Block | None = None
+    block: Optional[Block] = None
 
-    def __init__(self, input_file: InputFile, read_schema: Schema | None = None) -> None:
+    def __init__(self, input_file: InputFile, read_schema: Optional[Schema] = None) -> None:
         self.input_file = input_file
         self.read_schema = read_schema
 
diff --git a/python/pyiceberg/avro/reader.py b/python/pyiceberg/avro/reader.py
index 3de1105575..6b93b67937 100644
--- a/python/pyiceberg/avro/reader.py
+++ b/python/pyiceberg/avro/reader.py
@@ -31,7 +31,14 @@ from dataclasses import field as dataclassfield
 from datetime import date, datetime, time
 from decimal import Decimal
 from functools import singledispatch
-from typing import Any, Callable
+from typing import (
+    Any,
+    Callable,
+    List,
+    Optional,
+    Tuple,
+    Union,
+)
 from uuid import UUID
 
 from pyiceberg.avro.decoder import BinaryDecoder
@@ -98,7 +105,7 @@ def _skip_map_array(decoder: BinaryDecoder, skip_entry: Callable) -> None:
 
 @dataclass(frozen=True)
 class AvroStruct(StructProtocol):
-    _data: list[Any | StructProtocol] = dataclassfield()
+    _data: List[Union[Any, StructProtocol]] = dataclassfield()
 
     def set(self, pos: int, value: Any) -> None:
         self._data[pos] = value
@@ -242,7 +249,7 @@ class DecimalReader(Reader):
 class OptionReader(Reader):
     option: Reader = dataclassfield()
 
-    def read(self, decoder: BinaryDecoder) -> Any | None:
+    def read(self, decoder: BinaryDecoder) -> Optional[Any]:
         # For the Iceberg spec it is required to set the default value to null
         # From https://iceberg.apache.org/spec/#avro
         # Optional fields must always set the Avro field default value to null.
@@ -263,10 +270,10 @@ class OptionReader(Reader):
 
 @dataclass(frozen=True)
 class StructReader(Reader):
-    fields: tuple[tuple[int | None, Reader], ...] = dataclassfield()
+    fields: Tuple[Tuple[Optional[int], Reader], ...] = dataclassfield()
 
     def read(self, decoder: BinaryDecoder) -> AvroStruct:
-        result: list[Any | StructProtocol] = [None] * len(self.fields)
+        result: List[Union[Any, StructProtocol]] = [None] * len(self.fields)
         for (pos, field) in self.fields:
             if pos is not None:
                 result[pos] = field.read(decoder)
@@ -332,7 +339,7 @@ class ConstructReader(SchemaVisitor[Reader]):
     def schema(self, schema: Schema, struct_result: Reader) -> Reader:
         return struct_result
 
-    def struct(self, struct: StructType, field_results: list[Reader]) -> Reader:
+    def struct(self, struct: StructType, field_results: List[Reader]) -> Reader:
         return StructReader(tuple(enumerate(field_results)))
 
     def field(self, field: NestedField, field_result: Reader) -> Reader:
diff --git a/python/pyiceberg/catalog/__init__.py b/python/pyiceberg/catalog/__init__.py
index d4e3163c29..bb5cf417bf 100644
--- a/python/pyiceberg/catalog/__init__.py
+++ b/python/pyiceberg/catalog/__init__.py
@@ -21,7 +21,12 @@ import logging
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from enum import Enum
-from typing import Callable
+from typing import (
+    Callable,
+    List,
+    Optional,
+    Union,
+)
 
 from pyiceberg.exceptions import NotInstalledError
 from pyiceberg.schema import Schema
@@ -70,7 +75,7 @@ AVAILABLE_CATALOGS: dict[CatalogType, Callable[[str, Properties], Catalog]] = {
 }
 
 
-def infer_catalog_type(name: str, catalog_properties: RecursiveDict) -> CatalogType | None:
+def infer_catalog_type(name: str, catalog_properties: RecursiveDict) -> Optional[CatalogType]:
     """Tries to infer the type based on the dict
 
     Args:
@@ -97,7 +102,7 @@ def infer_catalog_type(name: str, catalog_properties: RecursiveDict) -> CatalogT
     )
 
 
-def load_catalog(name: str, **properties: str | None) -> Catalog:
+def load_catalog(name: str, **properties: Optional[str]) -> Catalog:
     """Load the catalog based on the properties
 
     Will look up the properties from the config, based on the name
@@ -116,7 +121,7 @@ def load_catalog(name: str, **properties: str | None) -> Catalog:
     env = _ENV_CONFIG.get_catalog_config(name)
     conf = merge_config(env or {}, properties)
 
-    catalog_type: CatalogType | None
+    catalog_type: Optional[CatalogType]
     if provided_catalog_type := conf.get(TYPE):
         catalog_type = CatalogType[provided_catalog_type.upper()]
     else:
@@ -130,9 +135,9 @@ def load_catalog(name: str, **properties: str | None) -> Catalog:
 
 @dataclass
 class PropertiesUpdateSummary:
-    removed: list[str]
-    updated: list[str]
-    missing: list[str]
+    removed: List[str]
+    updated: List[str]
+    missing: List[str]
 
 
 class Catalog(ABC):
@@ -159,9 +164,9 @@ class Catalog(ABC):
     @abstractmethod
     def create_table(
         self,
-        identifier: str | Identifier,
+        identifier: Union[str, Identifier],
         schema: Schema,
-        location: str | None = None,
+        location: Optional[str] = None,
         partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
         sort_order: SortOrder = UNSORTED_SORT_ORDER,
         properties: Properties = EMPTY_DICT,
@@ -184,7 +189,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def load_table(self, identifier: str | Identifier) -> Table:
+    def load_table(self, identifier: Union[str, Identifier]) -> Table:
         """Loads the table's metadata and returns the table instance.
 
         You can also use this method to check for table existence using 'try catalog.table() except NoSuchTableError'
@@ -201,7 +206,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def drop_table(self, identifier: str | Identifier) -> None:
+    def drop_table(self, identifier: Union[str, Identifier]) -> None:
         """Drop a table.
 
         Args:
@@ -212,7 +217,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def purge_table(self, identifier: str | Identifier) -> None:
+    def purge_table(self, identifier: Union[str, Identifier]) -> None:
         """Drop a table and purge all data and metadata files.
 
         Args:
@@ -223,7 +228,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def rename_table(self, from_identifier: str | Identifier, to_identifier: str | Identifier) -> Table:
+    def rename_table(self, from_identifier: Union[str, Identifier], to_identifier: Union[str, Identifier]) -> Table:
         """Rename a fully classified table name
 
         Args:
@@ -238,7 +243,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def create_namespace(self, namespace: str | Identifier, properties: Properties = EMPTY_DICT) -> None:
+    def create_namespace(self, namespace: Union[str, Identifier], properties: Properties = EMPTY_DICT) -> None:
         """Create a namespace in the catalog.
 
         Args:
@@ -250,7 +255,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def drop_namespace(self, namespace: str | Identifier) -> None:
+    def drop_namespace(self, namespace: Union[str, Identifier]) -> None:
         """Drop a namespace.
 
         Args:
@@ -262,7 +267,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def list_tables(self, namespace: str | Identifier) -> list[Identifier]:
+    def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
         """List tables under the given namespace in the catalog.
 
         If namespace not provided, will list all tables in the catalog.
@@ -278,7 +283,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]:
+    def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]:
         """List namespaces from the given namespace. If not given, list top-level namespaces from the catalog.
 
         Args:
@@ -292,7 +297,7 @@ class Catalog(ABC):
         """
 
     @abstractmethod
-    def load_namespace_properties(self, namespace: str | Identifier) -> Properties:
+    def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties:
         """Get properties for a namespace.
 
         Args:
@@ -307,7 +312,7 @@ class Catalog(ABC):
 
     @abstractmethod
     def update_namespace_properties(
-        self, namespace: str | Identifier, removals: set[str] | None = None, updates: Properties = EMPTY_DICT
+        self, namespace: Union[str, Identifier], removals: set[str] | None = None, updates: Properties = EMPTY_DICT
     ) -> PropertiesUpdateSummary:
         """Removes provided property keys and updates properties for a namespace.
 
@@ -322,7 +327,7 @@ class Catalog(ABC):
         """
 
     @staticmethod
-    def identifier_to_tuple(identifier: str | Identifier) -> Identifier:
+    def identifier_to_tuple(identifier: Union[str, Identifier]) -> Identifier:
         """Parses an identifier to a tuple.
 
         If the identifier is a string, it is split into a tuple on '.'. If it is a tuple, it is used as-is.
@@ -336,7 +341,7 @@ class Catalog(ABC):
         return identifier if isinstance(identifier, tuple) else tuple(str.split(identifier, "."))
 
     @staticmethod
-    def table_name_from(identifier: str | Identifier) -> str:
+    def table_name_from(identifier: Union[str, Identifier]) -> str:
         """Extracts table name from a table identifier
 
         Args:
@@ -348,7 +353,7 @@ class Catalog(ABC):
         return Catalog.identifier_to_tuple(identifier)[-1]
 
     @staticmethod
-    def namespace_from(identifier: str | Identifier) -> Identifier:
+    def namespace_from(identifier: Union[str, Identifier]) -> Identifier:
         """Extracts table namespace from a table identifier
 
         Args:
diff --git a/python/pyiceberg/expressions/literals.py b/python/pyiceberg/expressions/literals.py
index f49ee61805..458cbda41d 100644
--- a/python/pyiceberg/expressions/literals.py
+++ b/python/pyiceberg/expressions/literals.py
@@ -25,7 +25,12 @@ import struct
 from abc import ABC, abstractmethod
 from decimal import ROUND_HALF_UP, Decimal
 from functools import singledispatch, singledispatchmethod
-from typing import Generic, TypeVar
+from typing import (
+    Generic,
+    Optional,
+    TypeVar,
+    Union,
+)
 from uuid import UUID
 
 from pyiceberg.types import (
@@ -217,7 +222,7 @@ class LongLiteral(Literal[int]):
         return self
 
     @to.register(IntegerType)
-    def _(self, type_var: IntegerType) -> AboveMax | BelowMin | Literal[int]:
+    def _(self, _: IntegerType) -> Union[AboveMax, BelowMin, Literal[int]]:
         if IntegerType.max < self.value:
             return AboveMax()
         elif IntegerType.min > self.value:
@@ -305,7 +310,7 @@ class DoubleLiteral(Literal[float]):
         return self
 
     @to.register(FloatType)
-    def _(self, type_var: FloatType) -> AboveMax | BelowMin | Literal[float]:
+    def _(self, _: FloatType) -> Union[AboveMax, BelowMin, Literal[float]]:
         if FloatType.max < self.value:
             return AboveMax()
         elif FloatType.min > self.value:
@@ -369,7 +374,7 @@ class DecimalLiteral(Literal[Decimal]):
         return None
 
     @to.register(DecimalType)
-    def _(self, type_var: DecimalType) -> Literal[Decimal] | None:
+    def _(self, type_var: DecimalType) -> Optional[Literal[Decimal]]:
         if type_var.scale == abs(self.value.as_tuple().exponent):
             return self
         return None
@@ -388,28 +393,28 @@ class StringLiteral(Literal[str]):
         return self
 
     @to.register(DateType)
-    def _(self, type_var: DateType) -> Literal[int] | None:
+    def _(self, type_var: DateType) -> Optional[Literal[int]]:
         try:
             return DateLiteral(date_to_days(self.value))
         except (TypeError, ValueError):
             return None
 
     @to.register(TimeType)
-    def _(self, type_var: TimeType) -> Literal[int] | None:
+    def _(self, type_var: TimeType) -> Optional[Literal[int]]:
         try:
             return TimeLiteral(time_to_micros(self.value))
         except (TypeError, ValueError):
             return None
 
     @to.register(TimestampType)
-    def _(self, type_var: TimestampType) -> Literal[int] | None:
+    def _(self, type_var: TimestampType) -> Optional[Literal[int]]:
         try:
             return TimestampLiteral(timestamp_to_micros(self.value))
         except (TypeError, ValueError):
             return None
 
     @to.register(TimestamptzType)
-    def _(self, type_var: TimestamptzType) -> Literal[int] | None:
+    def _(self, type_var: TimestamptzType) -> Optional[Literal[int]]:
         try:
             return TimestampLiteral(timestamptz_to_micros(self.value))
         except (TypeError, ValueError):
@@ -420,7 +425,7 @@ class StringLiteral(Literal[str]):
         return UUIDLiteral(UUID(self.value))
 
     @to.register(DecimalType)
-    def _(self, type_var: DecimalType) -> Literal[Decimal] | None:
+    def _(self, type_var: DecimalType) -> Optional[Literal[Decimal]]:
         dec = Decimal(self.value)
         if type_var.scale == abs(dec.as_tuple().exponent):
             return DecimalLiteral(dec)
@@ -450,7 +455,7 @@ class FixedLiteral(Literal[bytes]):
         return None
 
     @to.register(FixedType)
-    def _(self, type_var: FixedType) -> Literal[bytes] | None:
+    def _(self, type_var: FixedType) -> Optional[Literal[bytes]]:
         if len(self.value) == type_var.length:
             return self
         else:
@@ -470,11 +475,11 @@ class BinaryLiteral(Literal[bytes]):
         return None
 
     @to.register(BinaryType)
-    def _(self, type_var: BinaryType) -> Literal[bytes]:
+    def _(self, _: BinaryType) -> Literal[bytes]:
         return self
 
     @to.register(FixedType)
-    def _(self, type_var: FixedType) -> Literal[bytes] | None:
+    def _(self, type_var: FixedType) -> Optional[Literal[bytes]]:
         if type_var.length == len(self.value):
             return FixedLiteral(self.value)
         else:
diff --git a/python/pyiceberg/utils/schema_conversion.py b/python/pyiceberg/utils/schema_conversion.py
index d6c32e0121..e3a9a437d1 100644
--- a/python/pyiceberg/utils/schema_conversion.py
+++ b/python/pyiceberg/utils/schema_conversion.py
@@ -18,7 +18,13 @@
 from __future__ import annotations
 
 import logging
-from typing import Any
+from typing import (
+    Any,
+    Dict,
+    List,
+    Tuple,
+    Union,
+)
 
 from pyiceberg.schema import Schema
 from pyiceberg.types import (
@@ -112,7 +118,7 @@ class AvroSchemaConversion:
         """
         return Schema(*[self._convert_field(field) for field in avro_schema["fields"]], schema_id=1)
 
-    def _resolve_union(self, type_union: dict | list | str) -> tuple[str | dict[str, Any], bool]:
+    def _resolve_union(self, type_union: Union[Dict, List, str]) -> Tuple[Union[str, Dict[str, Any]], bool]:
         """
         Converts Unions into their type and resolves if the field is required
 
@@ -135,7 +141,7 @@ class AvroSchemaConversion:
         Raises:
             TypeError: In the case non-optional union types are encountered
         """
-        avro_types: dict | list
+        avro_types: Union[Dict, List]
         if isinstance(type_union, str):
             # It is a primitive and required
             return type_union, True
@@ -161,7 +167,7 @@ class AvroSchemaConversion:
         # Filter the null value and return the type
         return list(filter(lambda t: t != "null", avro_types))[0], False
 
-    def _convert_schema(self, avro_type: str | dict[str, Any]) -> IcebergType:
+    def _convert_schema(self, avro_type: Union[str, Dict[str, Any]]) -> IcebergType:
         """
         Resolves the Avro type