You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@iceberg.apache.org by GitBox <gi...@apache.org> on 2022/08/08 10:11:25 UTC

[GitHub] [iceberg] Fokko commented on a diff in pull request #5462: [Python] add time (year, month, day, hour) transforms

Fokko commented on code in PR #5462:
URL: https://github.com/apache/iceberg/pull/5462#discussion_r940066857


##########
python/pyiceberg/transforms.py:
##########
@@ -215,6 +227,243 @@ def __repr__(self) -> str:
         return f"BucketTransform(num_buckets={self._num_buckets})"
 
 
+class YearTransform(Transform[S, int]):
+    """Transforms a datetime value into a year value.
+
+    Example:
+        >>> transform = YearTransform()
+        >>> transform.transform(TimestampType())(1512151975038194)
+        47
+    """
+
+    __root__: Literal["year"] = Field(default="year")
+    _source_type: IcebergType = PrivateAttr()
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        source_type = type(source)
+        if source_type == DateType:
+
+            def year_func(v):
+                return datetime.days_to_years(v)
+
+        elif source_type in {TimestampType, TimestamptzType}:
+
+            def year_func(v):
+                return datetime.micros_to_years(v)
+
+        else:
+            raise ValueError(f"Cannot apply year transform for type: {source}")
+
+        return lambda v: year_func(v) if v else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        return type(source) in {
+            DateType,
+            TimestampType,
+            TimestamptzType,
+        }
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        return IntegerType()
+
+    @property
+    def preserves_order(self) -> bool:
+        return True
+
+    @property
+    def time_order(self) -> int:
+        return 3
+
+    def satisfies_order_of(self, other: Transform) -> bool:
+        return self.time_order <= other.time_order if hasattr(other, "time_order") else False
+
+    def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+        return datetime.to_human_year(value) if isinstance(value, int) else "null"
+
+    @property
+    def dedup_name(self) -> str:
+        return "time"
+
+    def __repr__(self) -> str:
+        return "YearTransform()"
+
+
+class MonthTransform(Transform[S, int]):
+    """Transforms a datetime value into a month value.
+
+    Example:
+        >>> transform = MonthTransform()
+        >>> transform.transform(DateType())(17501)
+        575
+    """
+
+    __root__: Literal["month"] = Field(default="month")
+    _source_type: IcebergType = PrivateAttr()
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        source_type = type(source)
+        if source_type == DateType:
+
+            def month_func(v):
+                return datetime.days_to_months(v)
+
+        elif source_type in {TimestampType, TimestamptzType}:
+
+            def month_func(v):
+                return datetime.micros_to_months(v)
+
+        else:
+            raise ValueError(f"Cannot apply month transform for type: {source}")
+
+        return lambda v: month_func(v) if v else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        return type(source) in {
+            DateType,
+            TimestampType,
+            TimestamptzType,
+        }
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        return IntegerType()
+
+    @property
+    def preserves_order(self) -> bool:
+        return True
+
+    @property
+    def time_order(self) -> int:
+        return 2
+
+    def satisfies_order_of(self, other: Transform) -> bool:
+        return self.time_order <= other.time_order if hasattr(other, "time_order") else False
+
+    def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+        return datetime.to_human_month(value) if isinstance(value, int) else "null"
+
+    @property
+    def dedup_name(self) -> str:
+        return "time"
+
+    def __repr__(self) -> str:
+        return "MonthTransform()"
+
+
+class DayTransform(Transform[S, int]):
+    """Transforms a datetime value into a day value.
+
+    Example:
+        >>> transform = MonthTransform()
+        >>> transform.transform(DateType())(17501)
+        17501
+    """
+
+    __root__: Literal["day"] = Field(default="day")
+    _source_type: IcebergType = PrivateAttr()
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        source_type = type(source)
+        if source_type == DateType:
+
+            def day_func(v):
+                return v
+
+        elif source_type in {TimestampType, TimestamptzType}:
+
+            def day_func(v):
+                return datetime.micros_to_days(v)
+
+        else:
+            raise ValueError(f"Cannot apply day transform for type: {source}")
+
+        return lambda v: day_func(v) if v else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        return type(source) in {
+            DateType,
+            TimestampType,
+            TimestamptzType,
+        }
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        return DateType()
+
+    @property
+    def preserves_order(self) -> bool:
+        return True
+
+    @property
+    def time_order(self) -> int:
+        return 1
+
+    def satisfies_order_of(self, other: Transform) -> bool:
+        return self.time_order <= other.time_order if hasattr(other, "time_order") else False
+
+    def to_human_string(self, _: IcebergType, value: Optional[S]) -> str:
+        return datetime.to_human_day(value) if isinstance(value, int) else "null"
+
+    @property
+    def dedup_name(self) -> str:
+        return "time"
+
+    def __repr__(self) -> str:
+        return "DayTransform()"
+
+
+class HourTransform(Transform[S, int]):
+    """Transforms a datetime value into a hour value.
+
+    Example:
+        >>> transform = HourTransform()
+        >>> transform.transform(TimestampType())(1512151975038194)
+        420042
+    """
+
+    __root__: Literal["hour"] = Field(default="hour")
+    _source_type: IcebergType = PrivateAttr()
+
+    def transform(self, source: IcebergType) -> Callable[[Optional[S]], Optional[int]]:
+        if type(source) in {TimestampType, TimestamptzType}:
+
+            def hour_func(v):
+                return datetime.micros_to_hours(v)
+
+        else:
+            raise ValueError(f"Cannot apply hour transform for type: {source}")
+
+        return lambda v: hour_func(v) if v else None
+
+    def can_transform(self, source: IcebergType) -> bool:
+        return type(source) in {
+            TimestampType,
+            TimestamptzType,
+        }
+
+    def result_type(self, source: IcebergType) -> IcebergType:
+        return IntegerType()
+
+    @property
+    def preserves_order(self) -> bool:
+        return True
+
+    @property
+    def time_order(self) -> int:
+        return 0

Review Comment:
   Instead of using these numbers, what do you think of adding an `IntEnum` to make it a bit more readable?
   ```
   ➜  python git:(fd-add-http-code-to-error-message) ✗ python3
   Python 3.9.13 (main, May 24 2022, 21:13:51) 
   [Clang 13.1.6 (clang-1316.0.21.2)] on darwin
   Type "help", "copyright", "credits" or "license" for more information.
   >>> from enum import IntEnum
   >>> 
   >>> class TimeResolution(IntEnum):
   ...     YEAR = 6
   ...     MONTH = 5
   ...     WEEK = 4
   ...     DAY = 3
   ...     HOUR = 2
   ...     MINUTE = 1
   ...     SECOND = 0
   ... 
   >>> 
   >>> TimeResolution.YEAR <= TimeResolution.MONTH
   False
   ```



##########
python/tests/test_transforms.py:
##########
@@ -318,3 +420,31 @@ def test_void_transform_str():
 
 def test_void_transform_repr():
     assert repr(VoidTransform()) == "VoidTransform()"
+
+
+@pytest.mark.parametrize(
+    "transform,json",
+    [
+        (YearTransform(), '"year"'),
+        (MonthTransform(), '"month"'),
+        (DayTransform(), '"day"'),
+        (HourTransform(), '"hour"'),
+    ],
+)
+def test_datetime_transform_serde(transform, json):
+    assert transform.json() == json
+    assert TestType.parse_raw(json).__root__ == transform
+
+
+@pytest.mark.parametrize(

Review Comment:
   A bit of a nit, but I think this test both tests the str and repr, which I think makes sense to split into two tests (one for `repr` and one for `str`.
   
   We haven't discussed this, but I'm not a fan of the whole `parameterize` functionality in pytest mostly because the decorator initializes before the actual tests ran. For example, if there is an error in the `YearTransform()` constructor, it will fail directly. This conflicts a bit with my personal TDD approach to development.
   
   Therefore I just split everything into separate tests: https://github.com/apache/iceberg/blob/master/python/tests/test_types.py#L546
   This way you don't have to check on which input the test is failing.
   
   I've also added this to the list of Python topics for the next community sync.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org
For additional commands, e-mail: issues-help@iceberg.apache.org