You are viewing a plain text version of this content. The canonical link for it was not preserved in this plain-text copy.
Posted to commits@arrow.apache.org by uw...@apache.org on 2018/04/03 04:39:28 UTC

[arrow] branch master updated: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp.

This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new fff992a  ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp.
fff992a is described below

commit fff992a048c4e4feb241772047f89f48ebe6201c
Author: Albert Shieh <as...@ansatzcapital.com>
AuthorDate: Tue Apr 3 06:39:21 2018 +0200

    ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp.
    
    Fixes [ARROW-2122](https://issues.apache.org/jira/browse/ARROW-2122).
    
    Author: Albert Shieh <as...@ansatzcapital.com>
    
    Closes #1707 from adshieh/ARROW-2122 and squashes the following commits:
    
    2fae121 <Albert Shieh> No trailing characters
    1c5e093 <Albert Shieh> Change fixed offset timezone format and add tests
    71c5c47 <Albert Shieh> Fixed offset timezone handling
---
 python/pyarrow/pandas_compat.py              |  5 ++-
 python/pyarrow/scalar.pxi                    |  3 +-
 python/pyarrow/table.pxi                     |  6 ++-
 python/pyarrow/tests/test_convert_builtin.py | 12 ++++++
 python/pyarrow/tests/test_convert_pandas.py  | 10 +++++
 python/pyarrow/types.pxi                     | 61 +++++++++++++++++++++++++++-
 6 files changed, 90 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 97ea51d..24da744 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -129,7 +129,7 @@ def get_extension_dtype_info(column):
         }
         physical_dtype = str(cats.codes.dtype)
     elif hasattr(dtype, 'tz'):
-        metadata = {'timezone': str(dtype.tz)}
+        metadata = {'timezone': pa.lib.tzinfo_to_string(dtype.tz)}
         physical_dtype = 'datetime64[ns]'
     else:
         metadata = None
@@ -419,7 +419,7 @@ def dataframe_to_serialized_dict(frame):
         block_data = {}
 
         if isinstance(block, _int.DatetimeTZBlock):
-            block_data['timezone'] = values.tz.zone
+            block_data['timezone'] = pa.lib.tzinfo_to_string(values.tz)
             values = values.values
         elif isinstance(block, _int.CategoricalBlock):
             block_data.update(dictionary=values.categories,
@@ -483,6 +483,7 @@ def _reconstruct_block(item):
 
 def _make_datetimetz(tz):
     from pyarrow.compat import DatetimeTZDtype
+    tz = pa.lib.string_to_tzinfo(tz)
     return DatetimeTZDtype('ns', tz=tz)
 
 
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index 6784e56..e152771 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -237,8 +237,7 @@ cdef class TimestampValue(ArrayValue):
         value = self.value
 
         if not dtype.timezone().empty():
-            import pytz
-            tzinfo = pytz.timezone(frombytes(dtype.timezone()))
+            tzinfo = string_to_tzinfo(frombytes(dtype.timezone()))
         else:
             tzinfo = None
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index 672b9fb..cbf2a69 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -344,9 +344,11 @@ cdef class Column:
         result = pd.Series(values, name=self.name)
 
         if isinstance(self.type, TimestampType):
-            if self.type.tz is not None:
+            tz = self.type.tz
+            if tz is not None:
+                tz = string_to_tzinfo(tz)
                 result = (result.dt.tz_localize('utc')
-                          .dt.tz_convert(self.type.tz))
+                          .dt.tz_convert(tz))
 
         return result
 
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 19b59a4..988d512 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -26,6 +26,7 @@ import decimal
 import itertools
 import numpy as np
 import six
+import pytz
 
 
 int_type_pairs = [
@@ -649,3 +650,14 @@ def test_decimal_array_with_none_and_nan():
 
     array = pa.array(values, type=pa.decimal128(10, 4))
     assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
+
+
+@pytest.mark.parametrize('tz,name', [
+    (pytz.FixedOffset(90), '+01:30'),
+    (pytz.FixedOffset(-90), '-01:30'),
+    (pytz.utc, 'UTC'),
+    (pytz.timezone('America/New_York'), 'America/New_York')
+])
+def test_timezone_string(tz, name):
+    assert pa.lib.tzinfo_to_string(tz) == name
+    assert pa.lib.string_to_tzinfo(name) == tz
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 45ec66d..d4d3294 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1039,6 +1039,16 @@ class TestConvertDateTimeLikeTypes(object):
         expected = pd.Series([None, date(1991, 1, 1), None])
         assert pa.Array.from_pandas(expected).equals(result)
 
+    def test_fixed_offset_timezone(self):
+        df = pd.DataFrame({
+            'a': [
+                pd.Timestamp('2012-11-11 00:00:00+01:00'),
+                pd.NaT
+                ]
+             })
+        _check_pandas_roundtrip(df)
+        _check_serialize_components_roundtrip(df)
+
 
 class TestConvertStringLikeTypes(object):
     """
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index b0557eb..2abdb30 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import re
+
 # These are imprecise because the type (in pandas 0.x) depends on the presence
 # of nulls
 cdef dict _pandas_type_map = {
@@ -840,6 +842,63 @@ cdef timeunit_to_string(TimeUnit unit):
         return 'ns'
 
 
+_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$')
+
+
+def tzinfo_to_string(tz):
+    """
+    Converts a time zone object into a string indicating the name of a time
+    zone, one of:
+    * As used in the Olson time zone database (the "tz database" or
+      "tzdata"), such as "America/New_York"
+    * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+
+    Parameters
+    ----------
+      tz : datetime.tzinfo
+        Time zone object
+
+    Returns
+    -------
+      name : string
+        Time zone name
+    """
+    if tz.zone is None:
+        sign = '+' if tz._minutes >= 0 else '-'
+        hours, minutes = divmod(abs(tz._minutes), 60)
+        return '{}{:02d}:{:02d}'.format(sign, hours, minutes)
+    else:
+        return tz.zone
+
+
+def string_to_tzinfo(name):
+    """
+    Converts a string indicating the name of a time zone into a time zone
+    object, one of:
+    * As used in the Olson time zone database (the "tz database" or
+      "tzdata"), such as "America/New_York"
+    * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+
+    Parameters
+    ----------
+      name: string
+        Time zone name
+
+    Returns
+    -------
+      tz : datetime.tzinfo
+        Time zone object
+    """
+    import pytz
+    m = _FIXED_OFFSET_RE.match(name)
+    if m:
+        sign = 1 if m.group(1) == '+' else -1
+        hours, minutes = map(int, m.group(2, 3))
+        return pytz.FixedOffset(sign * (hours * 60 + minutes))
+    else:
+        return pytz.timezone(name)
+
+
 def timestamp(unit, tz=None):
     """
     Create instance of timestamp type with resolution and optional time zone
@@ -887,7 +946,7 @@ def timestamp(unit, tz=None):
         _timestamp_type_cache[unit_code] = out
     else:
         if not isinstance(tz, six.string_types):
-            tz = tz.zone
+            tz = tzinfo_to_string(tz)
 
         c_timezone = tobytes(tz)
         out.init(ctimestamp(unit_code, c_timezone))

-- 
To stop receiving notification emails like this one, please contact
uwe@apache.org.