You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/11/15 13:47:19 UTC
[arrow] branch master updated: ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones
This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 6e46bdc ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones
6e46bdc is described below
commit 6e46bdc9a354ebb15644e99a80f6cc07bb440b21
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Thu Nov 15 08:47:11 2018 -0500
ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones
Author: Krisztián Szűcs <sz...@gmail.com>
Closes #2975 from kszucs/ARROW-3703 and squashes the following commits:
dba35f267 <Krisztián Szűcs> more robust timezone to string conversion
---
python/pyarrow/tests/test_convert_pandas.py | 28 ++++++++++++++++++++++++++++
python/pyarrow/tests/test_parquet.py | 11 +++++++++++
python/pyarrow/types.pxi | 26 ++++++++++++++++++++++----
3 files changed, 61 insertions(+), 4 deletions(-)
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 0a0a524..7f672ea 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -15,6 +15,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
+
+import six
import decimal
import json
import multiprocessing as mp
@@ -26,6 +28,7 @@ import numpy.testing as npt
import pandas as pd
import pandas.util.testing as tm
import pytest
+import pytz
import pyarrow as pa
import pyarrow.types as patypes
@@ -823,6 +826,31 @@ class TestConvertDateTimeLikeTypes(object):
})
tm.assert_frame_equal(expected_df, result)
+ def test_python_datetime_with_pytz_tzinfo(self):
+ for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]:
+ values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
+ df = pd.DataFrame({'datetime': values})
+ _check_pandas_roundtrip(df)
+
+ @pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since '
+ 'python version 3.2')
+ def test_python_datetime_with_timezone_tzinfo(self):
+ from datetime import timezone
+
+ values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=pytz.utc)]
+ df = pd.DataFrame({'datetime': values})
+ _check_pandas_roundtrip(df)
+
+ # a datetime.timezone offset is expected to round-trip as the equivalent pytz.FixedOffset
+ hours = 1
+ tz_timezone = timezone(timedelta(hours=hours))
+ tz_pytz = pytz.FixedOffset(hours * 60)
+ values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)]
+ values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
+ df = pd.DataFrame({'datetime': values})
+ df_exp = pd.DataFrame({'datetime': values_exp})
+ _check_pandas_roundtrip(df, expected=df_exp)
+
def test_python_datetime_subclass(self):
class MyDatetime(datetime):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index bacffdf..8217dd3 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -20,6 +20,7 @@ import decimal
import io
import json
import os
+import six
import pytest
import numpy as np
@@ -244,6 +245,16 @@ def test_pandas_parquet_datetime_tz():
tm.assert_frame_equal(df, df_read)
+@pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since '
+ 'python version 3.2')
+def test_datetime_timezone_tzinfo():
+ value = datetime.datetime(2018, 1, 1, 1, 23, 45,
+ tzinfo=datetime.timezone.utc)
+ df = pd.DataFrame({'foo': [value]})
+
+ _roundtrip_pandas_dataframe(df, write_kwargs={})
+
+
def test_pandas_parquet_custom_metadata(tempdir):
df = alltypes_sample(size=10000)
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index fb7d081..399f15e 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -962,12 +962,30 @@ def tzinfo_to_string(tz):
name : string
Time zone name
"""
- if tz.zone is None:
- sign = '+' if tz._minutes >= 0 else '-'
- hours, minutes = divmod(abs(tz._minutes), 60)
+ import pytz
+ import datetime
+
+ def fixed_offset_to_string(offset):
+ seconds = int(offset.utcoffset(None).total_seconds())
+ sign = '+' if seconds >= 0 else '-'
+ minutes, seconds = divmod(abs(seconds), 60)
+ hours, minutes = divmod(minutes, 60)
+ if seconds > 0:
+ raise ValueError('Offset must represent whole number of minutes')
return '{}{:02d}:{:02d}'.format(sign, hours, minutes)
- else:
+
+ if isinstance(tz, pytz.tzinfo.BaseTzInfo):
return tz.zone
+ elif isinstance(tz, pytz._FixedOffset):
+ return fixed_offset_to_string(tz)
+ elif isinstance(tz, datetime.tzinfo):
+ if six.PY3 and isinstance(tz, datetime.timezone):
+ return fixed_offset_to_string(tz)
+ else:
+ raise ValueError('Unable to convert timezone `{}` to string'
+ .format(tz))
+ else:
+ raise TypeError('Must be an instance of `datetime.tzinfo`')
def string_to_tzinfo(name):