You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by fo...@apache.org on 2023/02/28 17:17:18 UTC
[iceberg] branch master updated: Python: Fix timezone concat issue (#6946)

This is an automated email from the ASF dual-hosted git repository.

fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git


The following commit(s) were added to refs/heads/master by this push:
     new 049ec491b5 Python: Fix timezone concat issue (#6946)
049ec491b5 is described below

commit 049ec491b51dbcaef61e30301a14eb71a4497a5f
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Tue Feb 28 18:17:10 2023 +0100

    Python: Fix timezone concat issue (#6946)
    
    Resolves #6945
---
 python/pyiceberg/io/pyarrow.py  | 27 ++++++++++++++++-----------
 python/tests/io/test_pyarrow.py |  2 +-
 2 files changed, 17 insertions(+), 12 deletions(-)

diff --git a/python/pyiceberg/io/pyarrow.py b/python/pyiceberg/io/pyarrow.py
index d2cd762273..46e384bc17 100644
--- a/python/pyiceberg/io/pyarrow.py
+++ b/python/pyiceberg/io/pyarrow.py
@@ -391,7 +391,7 @@ class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType], Singleto
         return pa.timestamp(unit="us")
 
     def visit_timestampz(self, _: TimestamptzType) -> pa.DataType:
-        return pa.timestamp(unit="us", tz="+00:00")
+        return pa.timestamp(unit="us", tz="UTC")
 
     def visit_string(self, _: StringType) -> pa.DataType:
         return pa.string()
@@ -477,7 +477,7 @@ def _file_to_table(
     projected_schema: Schema,
     projected_field_ids: Set[int],
     case_sensitive: bool,
-) -> pa.Table:
+) -> Optional[pa.Table]:
     _, path = PyArrowFileIO.parse_location(task.file.file_path)
 
     # Get the schema
@@ -512,10 +512,11 @@ def _file_to_table(
             columns=[col.name for col in file_project_schema.columns],
         )
 
-        if pyarrow_filter is not None:
-            arrow_table = arrow_table.filter(pyarrow_filter)
-
-        return to_requested_schema(projected_schema, file_project_schema, arrow_table)
+        # If there is no data, we don't have to go through the schema
+        if len(arrow_table) > 0:
+            return to_requested_schema(projected_schema, file_project_schema, arrow_table)
+        else:
+            return None
 
 
 def project_table(
@@ -547,11 +548,15 @@ def project_table(
     }.union(extract_field_ids(bound_row_filter))
 
     with ThreadPool() as pool:
-        tables = pool.starmap(
-            func=_file_to_table,
-            iterable=[(fs, task, bound_row_filter, projected_schema, projected_field_ids, case_sensitive) for task in tasks],
-            chunksize=None,  # we could use this to control how to materialize the generator of tasks (we should also make the expression above lazy)
-        )
+        tables = [
+            table
+            for table in pool.starmap(
+                func=_file_to_table,
+                iterable=[(fs, task, bound_row_filter, projected_schema, projected_field_ids, case_sensitive) for task in tasks],
+                chunksize=None,  # we could use this to control how to materialize the generator of tasks (we should also make the expression above lazy)
+            )
+            if table is not None
+        ]
 
     if len(tables) > 1:
         return pa.concat_tables(tables)
diff --git a/python/tests/io/test_pyarrow.py b/python/tests/io/test_pyarrow.py
index f894963d94..c07890053d 100644
--- a/python/tests/io/test_pyarrow.py
+++ b/python/tests/io/test_pyarrow.py
@@ -377,7 +377,7 @@ def test_timestamp_type_to_pyarrow() -> None:
 
 def test_timestamptz_type_to_pyarrow() -> None:
     iceberg_type = TimestamptzType()
-    assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.timestamp(unit="us", tz="+00:00")
+    assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.timestamp(unit="us", tz="UTC")
 
 
 def test_string_type_to_pyarrow() -> None: