You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@iceberg.apache.org by fo...@apache.org on 2023/02/28 17:17:18 UTC
[iceberg] branch master updated: Python: Fix timezone concat issue (#6946)
This is an automated email from the ASF dual-hosted git repository.
fokko pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/iceberg.git
The following commit(s) were added to refs/heads/master by this push:
new 049ec491b5 Python: Fix timezone concat issue (#6946)
049ec491b5 is described below
commit 049ec491b51dbcaef61e30301a14eb71a4497a5f
Author: Fokko Driesprong <fo...@apache.org>
AuthorDate: Tue Feb 28 18:17:10 2023 +0100
Python: Fix timezone concat issue (#6946)
Resolves #6945
---
python/pyiceberg/io/pyarrow.py | 27 ++++++++++++++++-----------
python/tests/io/test_pyarrow.py | 2 +-
2 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/python/pyiceberg/io/pyarrow.py b/python/pyiceberg/io/pyarrow.py
index d2cd762273..46e384bc17 100644
--- a/python/pyiceberg/io/pyarrow.py
+++ b/python/pyiceberg/io/pyarrow.py
@@ -391,7 +391,7 @@ class _ConvertToArrowSchema(SchemaVisitorPerPrimitiveType[pa.DataType], Singleto
return pa.timestamp(unit="us")
def visit_timestampz(self, _: TimestamptzType) -> pa.DataType:
- return pa.timestamp(unit="us", tz="+00:00")
+ return pa.timestamp(unit="us", tz="UTC")
def visit_string(self, _: StringType) -> pa.DataType:
return pa.string()
@@ -477,7 +477,7 @@ def _file_to_table(
projected_schema: Schema,
projected_field_ids: Set[int],
case_sensitive: bool,
-) -> pa.Table:
+) -> Optional[pa.Table]:
_, path = PyArrowFileIO.parse_location(task.file.file_path)
# Get the schema
@@ -512,10 +512,11 @@ def _file_to_table(
columns=[col.name for col in file_project_schema.columns],
)
- if pyarrow_filter is not None:
- arrow_table = arrow_table.filter(pyarrow_filter)
-
- return to_requested_schema(projected_schema, file_project_schema, arrow_table)
+ # If there is no data, we don't have to go through the schema
+ if len(arrow_table) > 0:
+ return to_requested_schema(projected_schema, file_project_schema, arrow_table)
+ else:
+ return None
def project_table(
@@ -547,11 +548,15 @@ def project_table(
}.union(extract_field_ids(bound_row_filter))
with ThreadPool() as pool:
- tables = pool.starmap(
- func=_file_to_table,
- iterable=[(fs, task, bound_row_filter, projected_schema, projected_field_ids, case_sensitive) for task in tasks],
- chunksize=None, # we could use this to control how to materialize the generator of tasks (we should also make the expression above lazy)
- )
+ tables = [
+ table
+ for table in pool.starmap(
+ func=_file_to_table,
+ iterable=[(fs, task, bound_row_filter, projected_schema, projected_field_ids, case_sensitive) for task in tasks],
+ chunksize=None, # we could use this to control how to materialize the generator of tasks (we should also make the expression above lazy)
+ )
+ if table is not None
+ ]
if len(tables) > 1:
return pa.concat_tables(tables)
diff --git a/python/tests/io/test_pyarrow.py b/python/tests/io/test_pyarrow.py
index f894963d94..c07890053d 100644
--- a/python/tests/io/test_pyarrow.py
+++ b/python/tests/io/test_pyarrow.py
@@ -377,7 +377,7 @@ def test_timestamp_type_to_pyarrow() -> None:
def test_timestamptz_type_to_pyarrow() -> None:
iceberg_type = TimestamptzType()
- assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.timestamp(unit="us", tz="+00:00")
+ assert visit(iceberg_type, _ConvertToArrowSchema()) == pa.timestamp(unit="us", tz="UTC")
def test_string_type_to_pyarrow() -> None: