You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@arrow.apache.org by we...@apache.org on 2018/11/15 13:47:19 UTC

[arrow] branch master updated: ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 6e46bdc  ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones
6e46bdc is described below

commit 6e46bdc9a354ebb15644e99a80f6cc07bb440b21
Author: Krisztián Szűcs <sz...@gmail.com>
AuthorDate: Thu Nov 15 08:47:11 2018 -0500

    ARROW-3703: [Python] DataFrame.to_parquet crashes if datetime column has time zones
    
    Author: Krisztián Szűcs <sz...@gmail.com>
    
    Closes #2975 from kszucs/ARROW-3703 and squashes the following commits:
    
    dba35f267 <Krisztián Szűcs> more robust timezone to string conversion
---
 python/pyarrow/tests/test_convert_pandas.py | 28 ++++++++++++++++++++++++++++
 python/pyarrow/tests/test_parquet.py        | 11 +++++++++++
 python/pyarrow/types.pxi                    | 26 ++++++++++++++++++++++----
 3 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 0a0a524..7f672ea 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -15,6 +15,8 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
+
+import six
 import decimal
 import json
 import multiprocessing as mp
@@ -26,6 +28,7 @@ import numpy.testing as npt
 import pandas as pd
 import pandas.util.testing as tm
 import pytest
+import pytz
 
 import pyarrow as pa
 import pyarrow.types as patypes
@@ -823,6 +826,31 @@ class TestConvertDateTimeLikeTypes(object):
         })
         tm.assert_frame_equal(expected_df, result)
 
+    def test_python_datetime_with_pytz_tzinfo(self):
+        for tz in [pytz.utc, pytz.timezone('US/Eastern'), pytz.FixedOffset(1)]:
+            values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz)]
+            df = pd.DataFrame({'datetime': values})
+            _check_pandas_roundtrip(df)
+
+    @pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since '
+                                        'python version 3.2')
+    def test_python_datetime_with_timezone_tzinfo(self):
+        from datetime import timezone
+
+        values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=pytz.utc)]
+        df = pd.DataFrame({'datetime': values})
+        _check_pandas_roundtrip(df)
+
+        # a datetime.timezone tzinfo is read back as pytz.FixedOffset
+        hours = 1
+        tz_timezone = timezone(timedelta(hours=hours))
+        tz_pytz = pytz.FixedOffset(hours * 60)
+        values = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_timezone)]
+        values_exp = [datetime(2018, 1, 1, 12, 23, 45, tzinfo=tz_pytz)]
+        df = pd.DataFrame({'datetime': values})
+        df_exp = pd.DataFrame({'datetime': values_exp})
+        _check_pandas_roundtrip(df, expected=df_exp)
+
     def test_python_datetime_subclass(self):
 
         class MyDatetime(datetime):
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index bacffdf..8217dd3 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -20,6 +20,7 @@ import decimal
 import io
 import json
 import os
+import six
 import pytest
 
 import numpy as np
@@ -244,6 +245,16 @@ def test_pandas_parquet_datetime_tz():
     tm.assert_frame_equal(df, df_read)
 
 
+@pytest.mark.skipif(six.PY2, reason='datetime.timezone is available since '
+                                    'python version 3.2')
+def test_datetime_timezone_tzinfo():
+    value = datetime.datetime(2018, 1, 1, 1, 23, 45,
+                              tzinfo=datetime.timezone.utc)
+    df = pd.DataFrame({'foo': [value]})
+
+    _roundtrip_pandas_dataframe(df, write_kwargs={})
+
+
 def test_pandas_parquet_custom_metadata(tempdir):
     df = alltypes_sample(size=10000)
 
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index fb7d081..399f15e 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -962,12 +962,30 @@ def tzinfo_to_string(tz):
       name : string
         Time zone name
     """
-    if tz.zone is None:
-        sign = '+' if tz._minutes >= 0 else '-'
-        hours, minutes = divmod(abs(tz._minutes), 60)
+    import pytz
+    import datetime
+
+    def fixed_offset_to_string(offset):
+        seconds = int(offset.utcoffset(None).total_seconds())
+        sign = '+' if seconds >= 0 else '-'
+        minutes, seconds = divmod(abs(seconds), 60)
+        hours, minutes = divmod(minutes, 60)
+        if seconds > 0:
+            raise ValueError('Offset must represent whole number of minutes')
         return '{}{:02d}:{:02d}'.format(sign, hours, minutes)
-    else:
+
+    if isinstance(tz, pytz.tzinfo.BaseTzInfo):
         return tz.zone
+    elif isinstance(tz, pytz._FixedOffset):
+        return fixed_offset_to_string(tz)
+    elif isinstance(tz, datetime.tzinfo):
+        if six.PY3 and isinstance(tz, datetime.timezone):
+            return fixed_offset_to_string(tz)
+        else:
+            raise ValueError('Unable to convert timezone `{}` to string'
+                             .format(tz))
+    else:
+        raise TypeError('Must be an instance of `datetime.tzinfo`')
 
 
 def string_to_tzinfo(name):