You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2022/06/13 20:15:33 UTC
[iceberg] branch master updated: Python: Add identity transform (#4908)
This is an automated email from the ASF dual-hosted git repository.
blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new c7d758c6a Python: Add identity transform (#4908)
c7d758c6a is described below
commit c7d758c6abae37783637b0aef2a56f07338f6bb5
Author: jun-he <ju...@users.noreply.github.com>
AuthorDate: Mon Jun 13 13:15:29 2022 -0700
Python: Add identity transform (#4908)
---
python/src/iceberg/transforms.py | 81 ++++++++++++++++++++++++++++++++++++
python/src/iceberg/utils/datetime.py | 34 ++++++++++++++-
python/tests/test_transforms.py | 48 +++++++++++++++++++++
3 files changed, 162 insertions(+), 1 deletion(-)
diff --git a/python/src/iceberg/transforms.py b/python/src/iceberg/transforms.py
index 39ec5d538..fbbdd917a 100644
--- a/python/src/iceberg/transforms.py
+++ b/python/src/iceberg/transforms.py
@@ -15,9 +15,11 @@
# specific language governing permissions and limitations
# under the License.
+import base64
import struct
from abc import ABC, abstractmethod
from decimal import Decimal
+from functools import singledispatchmethod
from typing import Generic, Optional, TypeVar
from uuid import UUID
@@ -37,6 +39,7 @@ from iceberg.types import (
TimeType,
UUIDType,
)
+from iceberg.utils import datetime
from iceberg.utils.decimal import decimal_to_bytes
S = TypeVar("S")
@@ -224,6 +227,80 @@ class BucketUUIDTransform(BaseBucketTransform):
)
+def _base64encode(buffer: bytes) -> str:
+    """Converts bytes to base64 string"""
+    return base64.b64encode(buffer).decode("ISO-8859-1")  # b64encode emits only ASCII bytes, so this decode maps them one-to-one
+
+
+class IdentityTransform(Transform[S, S]):
+    """Transforms a value into itself.
+
+    Example:
+        >>> transform = IdentityTransform(StringType())
+        >>> transform.apply('hello-world')
+        'hello-world'
+    """
+
+    def __init__(self, source_type: IcebergType):
+        super().__init__(
+            "identity",
+            f"transforms.identity(source_type={repr(source_type)})",  # repr string, written as a call to the identity() factory
+        )
+        self._type = source_type  # kept so int values can be rendered according to the source type
+
+    def apply(self, value: Optional[S]) -> Optional[S]:
+        return value  # the input, including None, is returned unchanged
+
+    def can_transform(self, source: IcebergType) -> bool:
+        return source.is_primitive  # accepts any source type that reports itself as primitive
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        return source  # the result type is the source type itself
+
+    @property
+    def preserves_order(self) -> bool:
+        return True
+
+    def satisfies_order_of(self, other: Transform) -> bool:
+        """ordering by value is the same as long as the other preserves order"""
+        return other.preserves_order
+
+    def to_human_string(self, value: Optional[S]) -> str:
+        return self._human_string(value)  # public entry point; the handler is chosen by type(value)
+
+    @singledispatchmethod
+    def _human_string(self, value: Optional[S]) -> str:  # default handler for types with no registered overload
+        return str(value) if value is not None else "null"  # None is rendered as the literal text "null"
+
+    @_human_string.register(bytes)
+    def _(self, value: bytes) -> str:
+        return _base64encode(value)  # bytes values are shown as base64 text
+
+    @_human_string.register(int)
+    def _(self, value: int) -> str:  # NOTE(review): bool is a subclass of int, so bool values also take this path
+        return self._int_to_human_string(self._type, value)  # an int's rendering depends on the source type given at construction
+
+    @singledispatchmethod
+    def _int_to_human_string(self, value_type: IcebergType, value: int) -> str:  # dispatch is on type(value_type), not on value
+        return str(value)  # fallback for int values of any source type without a registered overload
+
+    @_int_to_human_string.register(DateType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_day(value)  # datetime here is iceberg.utils.datetime, not the stdlib module
+
+    @_int_to_human_string.register(TimeType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_time(value)  # datetime here is iceberg.utils.datetime, not the stdlib module
+
+    @_int_to_human_string.register(TimestampType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_timestamp(value)  # datetime here is iceberg.utils.datetime, not the stdlib module
+
+    @_int_to_human_string.register(TimestamptzType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_timestamptz(value)  # datetime here is iceberg.utils.datetime, not the stdlib module
+
+
class UnknownTransform(Transform):
"""A transform that represents when an unknown transform is provided
Args:
@@ -294,5 +371,9 @@ def bucket(source_type: IcebergType, num_buckets: int) -> BaseBucketTransform:
raise ValueError(f"Cannot bucket by type: {source_type}")
+def identity(source_type: IcebergType) -> IdentityTransform:  # factory for an identity transform over source_type
+    return IdentityTransform(source_type)  # thin wrapper around the IdentityTransform constructor
+
+
def always_null() -> Transform:
return VoidTransform()
diff --git a/python/src/iceberg/utils/datetime.py b/python/src/iceberg/utils/datetime.py
index c8c12393b..c10dda4fb 100644
--- a/python/src/iceberg/utils/datetime.py
+++ b/python/src/iceberg/utils/datetime.py
@@ -17,7 +17,12 @@
"""Helper methods for working with date/time representations
"""
import re
-from datetime import date, datetime, time
+from datetime import (
+ date,
+ datetime,
+ time,
+ timedelta,
+)
EPOCH_DATE = date.fromisoformat("1970-01-01")
EPOCH_TIMESTAMP = datetime.fromisoformat("1970-01-01T00:00:00.000000")
@@ -42,6 +47,13 @@ def time_to_micros(time_str: str) -> int:
return (((t.hour * 60 + t.minute) * 60) + t.second) * 1_000_000 + t.microsecond
+def time_from_micros(micros: int) -> time:  # builds a datetime.time from a microsecond count
+    seconds = micros // 1_000_000  # whole seconds contained in micros
+    minutes = seconds // 60  # whole minutes contained in micros
+    hours = minutes // 60  # NOTE(review): not reduced modulo 24; time() rejects hour values outside 0..23
+    return time(hour=hours, minute=minutes % 60, second=seconds % 60, microsecond=micros % 1_000_000)  # remainders give each field
+
+
def datetime_to_micros(dt: datetime) -> int:
"""Converts a datetime to microseconds from 1970-01-01T00:00:00.000000"""
if dt.tzinfo:
@@ -63,3 +75,23 @@ def timestamptz_to_micros(timestamptz_str: str) -> int:
if ISO_TIMESTAMPTZ.fullmatch(timestamptz_str):
return datetime_to_micros(datetime.fromisoformat(timestamptz_str))
raise ValueError(f"Invalid timestamp with zone: {timestamptz_str} (must be ISO-8601)")
+
+
+def to_human_day(day_ordinal: int) -> str:
+    """Converts a DateType value to human string"""
+    return (EPOCH_DATE + timedelta(days=day_ordinal)).isoformat()  # day_ordinal is an offset in days from EPOCH_DATE (1970-01-01)
+
+
+def to_human_time(micros_from_midnight: int) -> str:
+    """Converts a TimeType value to human string"""
+    return time_from_micros(micros_from_midnight).isoformat()  # time.isoformat() of the time built by time_from_micros
+
+
+def to_human_timestamptz(timestamp_micros: int) -> str:
+    """Converts a TimestamptzType value to human string"""
+    return (EPOCH_TIMESTAMPTZ + timedelta(microseconds=timestamp_micros)).isoformat()  # EPOCH_TIMESTAMPTZ is defined outside this hunk; presumably the zone-aware epoch - verify
+
+
+def to_human_timestamp(timestamp_micros: int) -> str:
+    """Converts a TimestampType value to human string"""
+    return (EPOCH_TIMESTAMP + timedelta(microseconds=timestamp_micros)).isoformat()  # offset in microseconds from EPOCH_TIMESTAMP, which carries no zone
diff --git a/python/tests/test_transforms.py b/python/tests/test_transforms.py
index 7855345f9..d7b8e968a 100644
--- a/python/tests/test_transforms.py
+++ b/python/tests/test_transforms.py
@@ -28,7 +28,9 @@ from iceberg.types import (
BooleanType,
DateType,
DecimalType,
+ DoubleType,
FixedType,
+ FloatType,
IntegerType,
LongType,
StringType,
@@ -129,6 +131,52 @@ def test_string_with_surrogate_pair():
assert bucket_transform.hash(string_with_surrogate_pair) == mmh3.hash(as_bytes)
+@pytest.mark.parametrize(
+    "type_var,value,expected",
+    [
+        (LongType(), None, "null"),  # None renders as "null"
+        (DateType(), 17501, "2017-12-01"),  # days since 1970-01-01
+        (TimeType(), 36775038194, "10:12:55.038194"),  # microseconds since midnight
+        (TimestamptzType(), 1512151975038194, "2017-12-01T18:12:55.038194+00:00"),  # same micros as the next case, rendered with an offset
+        (TimestampType(), 1512151975038194, "2017-12-01T18:12:55.038194"),  # rendered without an offset
+        (LongType(), -1234567890000, "-1234567890000"),  # plain ints fall back to str()
+        (StringType(), "a/b/c=d", "a/b/c=d"),  # strings pass through unchanged
+        (DecimalType(9, 2), Decimal("-1.50"), "-1.50"),  # str() of a Decimal keeps the trailing zero
+        (FixedType(100), b"foo", "Zm9v"),  # bytes are base64-encoded
+    ],
+)
+def test_identity_human_string(type_var, value, expected):  # checks to_human_string for each source type / value pair
+    identity = transforms.identity(type_var)
+    assert identity.to_human_string(value) == expected
+
+
+@pytest.mark.parametrize(
+    "type_var",
+    [
+        BinaryType(),
+        BooleanType(),
+        DateType(),
+        DecimalType(8, 2),
+        DoubleType(),
+        FixedType(16),
+        FloatType(),
+        IntegerType(),
+        LongType(),
+        StringType(),
+        TimestampType(),
+        TimestamptzType(),
+        TimeType(),
+        UUIDType(),
+    ],
+)
+def test_identity_method(type_var):  # runs the same checks for every source type listed above
+    identity_transform = transforms.identity(type_var)
+    assert str(identity_transform) == str(eval(repr(identity_transform)))  # repr must evaluate back to an equivalent transform
+    assert identity_transform.can_transform(type_var)
+    assert identity_transform.result_type(type_var) == type_var
+    assert identity_transform.apply("test") == "test"  # apply returns its argument whatever the source type is
+
+
def test_unknown_transform():
unknown_transform = transforms.UnknownTransform(FixedType(8), "unknown")
assert str(unknown_transform) == str(eval(repr(unknown_transform)))