You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by bl...@apache.org on 2022/06/13 20:15:33 UTC

[iceberg] branch master updated: Python: Add identity transform (#4908)

This is an automated email from the ASF dual-hosted git repository.

blue pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new c7d758c6a Python: Add identity transform (#4908)
c7d758c6a is described below

commit c7d758c6abae37783637b0aef2a56f07338f6bb5
Author: jun-he <ju...@users.noreply.github.com>
AuthorDate: Mon Jun 13 13:15:29 2022 -0700

    Python: Add identity transform (#4908)
---
 python/src/iceberg/transforms.py     | 81 ++++++++++++++++++++++++++++++++++++
 python/src/iceberg/utils/datetime.py | 34 ++++++++++++++-
 python/tests/test_transforms.py      | 48 +++++++++++++++++++++
 3 files changed, 162 insertions(+), 1 deletion(-)

diff --git a/python/src/iceberg/transforms.py b/python/src/iceberg/transforms.py
index 39ec5d538..fbbdd917a 100644
--- a/python/src/iceberg/transforms.py
+++ b/python/src/iceberg/transforms.py
@@ -15,9 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import base64
 import struct
 from abc import ABC, abstractmethod
 from decimal import Decimal
+from functools import singledispatchmethod
 from typing import Generic, Optional, TypeVar
 from uuid import UUID
 
@@ -37,6 +39,7 @@ from iceberg.types import (
     TimeType,
     UUIDType,
 )
+from iceberg.utils import datetime
 from iceberg.utils.decimal import decimal_to_bytes
 
 S = TypeVar("S")
@@ -224,6 +227,80 @@ class BucketUUIDTransform(BaseBucketTransform):
         )
 
 
+def _base64encode(buffer: bytes) -> str:
+    """Encodes bytes as base64 and returns the result as a string"""
+    return base64.b64encode(buffer).decode("ISO-8859-1")
+
+
+class IdentityTransform(Transform[S, S]):
+    """Transforms a value into itself.
+
+    Example:
+        >>> transform = IdentityTransform(StringType())
+        >>> transform.apply('hello-world')
+        'hello-world'
+    """
+
+    def __init__(self, source_type: IcebergType):
+        super().__init__(
+            "identity",
+            f"transforms.identity(source_type={repr(source_type)})",
+        )
+        self._type = source_type  # consulted when formatting int values for display
+
+    def apply(self, value: Optional[S]) -> Optional[S]:
+        return value
+
+    def can_transform(self, source: IcebergType) -> bool:
+        return source.is_primitive
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        return source
+
+    @property
+    def preserves_order(self) -> bool:
+        return True
+
+    def satisfies_order_of(self, other: Transform) -> bool:
+        """Returns True when the other transform preserves order, since ordering by value is then the same"""
+        return other.preserves_order
+
+    def to_human_string(self, value: Optional[S]) -> str:
+        return self._human_string(value)
+
+    @singledispatchmethod
+    def _human_string(self, value: Optional[S]) -> str:
+        return str(value) if value is not None else "null"  # fallback when no type-specific handler is registered
+
+    @_human_string.register(bytes)
+    def _(self, value: bytes) -> str:
+        return _base64encode(value)
+
+    @_human_string.register(int)
+    def _(self, value: int) -> str:
+        return self._int_to_human_string(self._type, value)  # dispatches on the source type, not on the value
+
+    @singledispatchmethod
+    def _int_to_human_string(self, value_type: IcebergType, value: int) -> str:
+        return str(value)
+
+    @_int_to_human_string.register(DateType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_day(value)
+
+    @_int_to_human_string.register(TimeType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_time(value)
+
+    @_int_to_human_string.register(TimestampType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_timestamp(value)
+
+    @_int_to_human_string.register(TimestamptzType)
+    def _(self, value_type: IcebergType, value: int) -> str:
+        return datetime.to_human_timestamptz(value)
+
+
 class UnknownTransform(Transform):
     """A transform that represents when an unknown transform is provided
     Args:
@@ -294,5 +371,9 @@ def bucket(source_type: IcebergType, num_buckets: int) -> BaseBucketTransform:
         raise ValueError(f"Cannot bucket by type: {source_type}")
 
 
+def identity(source_type: IcebergType) -> IdentityTransform:
+    return IdentityTransform(source_type)
+
+
 def always_null() -> Transform:
     return VoidTransform()
diff --git a/python/src/iceberg/utils/datetime.py b/python/src/iceberg/utils/datetime.py
index c8c12393b..c10dda4fb 100644
--- a/python/src/iceberg/utils/datetime.py
+++ b/python/src/iceberg/utils/datetime.py
@@ -17,7 +17,12 @@
 """Helper methods for working with date/time representations
 """
 import re
-from datetime import date, datetime, time
+from datetime import (
+    date,
+    datetime,
+    time,
+    timedelta,
+)
 
 EPOCH_DATE = date.fromisoformat("1970-01-01")
 EPOCH_TIMESTAMP = datetime.fromisoformat("1970-01-01T00:00:00.000000")
@@ -42,6 +47,13 @@ def time_to_micros(time_str: str) -> int:
     return (((t.hour * 60 + t.minute) * 60) + t.second) * 1_000_000 + t.microsecond
 
 
+def time_from_micros(micros: int) -> time:
+    seconds = micros // 1_000_000
+    minutes = seconds // 60
+    hours = minutes // 60  # not wrapped modulo 24: time() rejects an hour outside 0..23
+    return time(hour=hours, minute=minutes % 60, second=seconds % 60, microsecond=micros % 1_000_000)
+
+
 def datetime_to_micros(dt: datetime) -> int:
     """Converts a datetime to microseconds from 1970-01-01T00:00:00.000000"""
     if dt.tzinfo:
@@ -63,3 +75,23 @@ def timestamptz_to_micros(timestamptz_str: str) -> int:
     if ISO_TIMESTAMPTZ.fullmatch(timestamptz_str):
         return datetime_to_micros(datetime.fromisoformat(timestamptz_str))
     raise ValueError(f"Invalid timestamp with zone: {timestamptz_str} (must be ISO-8601)")
+
+
+def to_human_day(day_ordinal: int) -> str:
+    """Converts a DateType value (days from 1970-01-01) to an ISO-8601 date string"""
+    return (EPOCH_DATE + timedelta(days=day_ordinal)).isoformat()
+
+
+def to_human_time(micros_from_midnight: int) -> str:
+    """Converts a TimeType value (microseconds from midnight) to an ISO-8601 time string"""
+    return time_from_micros(micros_from_midnight).isoformat()
+
+
+def to_human_timestamptz(timestamp_micros: int) -> str:
+    """Converts a TimestamptzType value (microseconds from EPOCH_TIMESTAMPTZ) to an ISO-8601 timestamp string"""
+    return (EPOCH_TIMESTAMPTZ + timedelta(microseconds=timestamp_micros)).isoformat()
+
+
+def to_human_timestamp(timestamp_micros: int) -> str:
+    """Converts a TimestampType value (microseconds from 1970-01-01T00:00:00) to an ISO-8601 timestamp string"""
+    return (EPOCH_TIMESTAMP + timedelta(microseconds=timestamp_micros)).isoformat()
diff --git a/python/tests/test_transforms.py b/python/tests/test_transforms.py
index 7855345f9..d7b8e968a 100644
--- a/python/tests/test_transforms.py
+++ b/python/tests/test_transforms.py
@@ -28,7 +28,9 @@ from iceberg.types import (
     BooleanType,
     DateType,
     DecimalType,
+    DoubleType,
     FixedType,
+    FloatType,
     IntegerType,
     LongType,
     StringType,
@@ -129,6 +131,52 @@ def test_string_with_surrogate_pair():
     assert bucket_transform.hash(string_with_surrogate_pair) == mmh3.hash(as_bytes)
 
 
+@pytest.mark.parametrize(
+    "type_var,value,expected",
+    [
+        (LongType(), None, "null"),
+        (DateType(), 17501, "2017-12-01"),
+        (TimeType(), 36775038194, "10:12:55.038194"),
+        (TimestamptzType(), 1512151975038194, "2017-12-01T18:12:55.038194+00:00"),
+        (TimestampType(), 1512151975038194, "2017-12-01T18:12:55.038194"),
+        (LongType(), -1234567890000, "-1234567890000"),
+        (StringType(), "a/b/c=d", "a/b/c=d"),
+        (DecimalType(9, 2), Decimal("-1.50"), "-1.50"),
+        (FixedType(100), b"foo", "Zm9v"),
+    ],
+)
+def test_identity_human_string(type_var, value, expected):
+    identity = transforms.identity(type_var)
+    assert identity.to_human_string(value) == expected
+
+
+@pytest.mark.parametrize(
+    "type_var",
+    [
+        BinaryType(),
+        BooleanType(),
+        DateType(),
+        DecimalType(8, 2),
+        DoubleType(),
+        FixedType(16),
+        FloatType(),
+        IntegerType(),
+        LongType(),
+        StringType(),
+        TimestampType(),
+        TimestamptzType(),
+        TimeType(),
+        UUIDType(),
+    ],
+)
+def test_identity_method(type_var):
+    identity_transform = transforms.identity(type_var)
+    assert str(identity_transform) == str(eval(repr(identity_transform)))
+    assert identity_transform.can_transform(type_var)
+    assert identity_transform.result_type(type_var) == type_var
+    assert identity_transform.apply("test") == "test"
+
+
 def test_unknown_transform():
     unknown_transform = transforms.UnknownTransform(FixedType(8), "unknown")
     assert str(unknown_transform) == str(eval(repr(unknown_transform)))